def generate_csv(self) -> None:
    """Write a CSV of short text lines taken from the books in ``iter_books()``.

    Every sentence of every book is split on newlines. A resulting line is
    kept when it has between 2 and 30 whitespace-separated words (inclusive)
    and, if ``self.includes`` is set, the lower-cased line contains
    ``self.includes``. Books whose lower-cased ``raw_content`` does not
    contain ``self.includes`` are skipped entirely.

    At most ``self.max_lines`` rows are collected. The rows are written to
    ``self.csv_path`` with the columns ``title`` and ``text`` and no index
    column.
    """
    logger.debug('generating rag csv: %s', self.name)
    # NOTE(review): matching is done against lower-cased text, so
    # self.includes is presumably expected to be lower-case already - confirm.
    needle = self.includes
    rows = []
    for book in iter_books():
        if len(rows) >= self.max_lines:
            break
        # Cheap whole-book pre-filter before walking its sentences.
        if needle is not None and needle not in book.raw_content.lower():
            continue
        for sentence in book.sentences:
            if len(rows) >= self.max_lines:
                break
            for line in sentence.split('\n'):
                # Enforce the cap here as well: a sentence spanning several
                # lines could otherwise push the output past max_lines.
                if len(rows) >= self.max_lines:
                    break
                # Blank lines have zero words, so this also filters them out.
                word_count = len(line.split())
                if word_count < 2 or word_count > 30:
                    continue
                if needle is not None and needle not in line.lower():
                    continue
                rows.append({
                    'title': book.title,
                    'text': line.strip(),
                })
    # Explicit columns keep the header present even when no row matched.
    pandas.DataFrame(rows, columns=['title', 'text']).to_csv(
        self.csv_path, index=False)
def process(self, verbose=False) -> None:
    """Run ``self.update_groups`` on every book yielded by ``iter_books()``.

    ``self.log_stats(verbose=verbose)`` is called after each book whose
    zero-based position is a multiple of ``self.log_rate``, which includes
    the very first book.
    """
    position = 0
    for book in iter_books():
        # Maybe only update prev if update_groups did anything.
        # Doesn't really matter
        self.update_groups(book)
        if not position % self.log_rate:
            self.log_stats(verbose=verbose)
        position += 1
def load_tagged_docs(self) -> None:
    """Load books from ``iter_books()`` into ``self.tagged_docs``.

    Before each load the size of ``self.tagged_docs.items`` is compared with
    ``self.num_items``; loading stops as soon as that size reaches the limit
    or when ``iter_books()`` is exhausted.
    """
    # TODO: change this, it's a bit bookey rather than texty
    for item in iter_books():
        # NOTE(review): presumably load() is what grows tagged_docs.items;
        # if it does not, this loop only ends when the books run out - confirm.
        if len(self.tagged_docs.items) >= self.num_items:
            break
        self.tagged_docs.load(item)
def get_wikisearch_people(stop_after=1000):
    """Return author metadata, topped up with Wikipedia lookups.

    The JSON object stored at ``AUTHOR_METADATA_PATH`` is loaded and every
    value is passed through ``clean_person``. The first ``stop_after`` books
    from ``iter_books()`` are then examined: a book is queued for lookup
    unless its author already has both a ``'born'`` and a ``'nationality'``
    value. Queued books are passed to ``get_wikipedia_person_by_book`` on a
    pool of 20 threads and the results are merged into the metadata.

    Args:
        stop_after: Number of books to examine (not the number of lookups
            performed). Examination stops when the zero-based book index
            equals this value.

    Returns:
        dict keyed by author name. Lookup results overwrite existing
        entries that share the same key.
    """
    # JSON is UTF-8 by specification; do not rely on the locale's default
    # encoding, which varies between platforms.
    with open(AUTHOR_METADATA_PATH, 'r', encoding='utf-8') as f:
        people = {name: clean_person(raw) for name, raw in json.load(f).items()}

    books = []
    for idx, book in enumerate(iter_books()):
        if idx == stop_after:
            break
        author = book.author.name
        if author in people:
            known = people[author]
            # Nothing left to look up when both fields are already filled in.
            if known['born'] is not None and known['nationality'] is not None:
                continue
        books.append(book)

    with ThreadPool(processes=20) as pool:
        people_responses = pool.map(get_wikipedia_person_by_book, books)
    # NOTE(review): each response is indexed as [0] -> key, [1] -> value;
    # presumably a (name, person) pair - confirm against the helper.
    people.update({p[0]: p[1] for p in people_responses})
    return people
def test_iter_books_goodreads(self):
    # iter_books() must yield exactly three books; their titles, compared
    # in sorted order, are 'Title 1', 'Title 2' and 'Title 3'.
    found = list(iter_books())
    self.assertEqual(len(found), 3)
    titles = sorted(entry.title for entry in found)
    self.assertEqual(titles, ['Title 1', 'Title 2', 'Title 3'])