コード例 #1
0
ファイル: model.py プロジェクト: RobertLucey/mauve
    def generate_csv(self) -> None:
        """Collect short lines of book text and write them to ``self.csv_path``.

        Every sentence of every book from ``iter_books()`` is split on
        newlines. A resulting line is kept when it has between 2 and 30
        whitespace-separated words (inclusive). When ``self.includes`` is
        not ``None``, a book is skipped unless its lower-cased raw content
        contains ``self.includes``, and a line is kept only if its
        lower-cased text contains it too. Matching is done against
        lower-cased text, so ``self.includes`` is expected to be lower case.

        At most ``self.max_lines`` rows are collected. The rows are written
        as a CSV with the columns ``title`` and ``text`` and no index column.
        """
        logger.debug('generating rag csv: %s', self.name)
        needle = self.includes
        to_write = []
        for book in iter_books():
            if len(to_write) >= self.max_lines:
                break
            # Cheap whole-book pre-filter before walking every sentence.
            if needle is not None and needle not in book.raw_content.lower():
                continue
            for sentence in book.sentences:
                if len(to_write) >= self.max_lines:
                    break
                for sub_sentence in sentence.split('\n'):
                    # Enforce the cap per line as well: one sentence can
                    # contain several lines and would otherwise overshoot.
                    if len(to_write) >= self.max_lines:
                        break
                    text = sub_sentence.strip()
                    # Blank lines have zero words, so this also drops them.
                    word_count = len(text.split())
                    if word_count < 2 or word_count > 30:
                        continue
                    if needle is not None and needle not in sub_sentence.lower():
                        continue
                    to_write.append({
                        'title': book.title,
                        'text': text,
                    })

        pandas.DataFrame(to_write).to_csv(self.csv_path, index=False)
コード例 #2
0
 def process(self, verbose=False) -> None:
     """Update the groups with every book and log stats periodically.

     Each book from ``iter_books()`` is passed to ``self.update_groups``.
     ``self.log_stats`` is called for every book whose position is a
     multiple of ``self.log_rate``, starting with the first book.

     :param verbose: forwarded unchanged to ``self.log_stats``.
     """
     for position, current_book in enumerate(iter_books()):
         # Maybe only update prev if update_groups did anything.
         # Doesn't really matter
         self.update_groups(current_book)
         if position % self.log_rate != 0:
             continue
         self.log_stats(verbose=verbose)
コード例 #3
0
 def load_tagged_docs(self) -> None:
     """Load books into ``self.tagged_docs`` up to ``self.num_items`` items.

     Books from ``iter_books()`` are passed to ``self.tagged_docs.load``
     one at a time. Loading stops when ``len(self.tagged_docs.items)``
     reaches ``self.num_items`` or the books run out.
     """
     # TODO: change this, it's a bit bookey rather than texty
     for item in iter_books():
         if len(self.tagged_docs.items) >= self.num_items:
             break
         self.tagged_docs.load(item)
コード例 #4
0
def get_wikisearch_people(stop_after=1000):
    """Return author metadata, refreshed from Wikipedia where incomplete.

    The stored metadata at ``AUTHOR_METADATA_PATH`` is loaded and each entry
    is passed through ``clean_person``. Books from ``iter_books()`` are then
    scanned, and a book is queued for lookup when its author has a stored
    entry whose ``born`` or ``nationality`` is ``None``. The queued books
    are resolved with ``get_wikipedia_person_by_book`` on a pool of 20
    threads, and the results are merged over the stored entries.

    :param stop_after: index at which scanning of ``iter_books()`` stops,
        so only books at positions below it are considered.
    :return: dict mapping a person key to that person's data. Each lookup
        response contributes its first element as the key and its second
        element as the value.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(AUTHOR_METADATA_PATH, 'r', encoding='utf-8') as f:
        people = {
            name: clean_person(raw) for name, raw in json.load(f).items()
        }

    books = []
    for idx, book in enumerate(iter_books()):
        if idx == stop_after:
            break
        author_name = book.author.name
        # NOTE(review): authors with no stored entry are never queued for
        # lookup; only known-but-incomplete ones are. Confirm this is intended.
        if author_name not in people:
            continue
        person = people[author_name]
        if person['born'] is not None and person['nationality'] is not None:
            continue
        books.append(book)

    with ThreadPool(processes=20) as pool:
        people_responses = pool.map(get_wikipedia_person_by_book, books)

    people.update({p[0]: p[1] for p in people_responses})

    return people
コード例 #5
0
ファイル: test_utils.py プロジェクト: RobertLucey/mauve
 def test_iter_books_goodreads(self):
     """Check that iter_books() yields three books titled 'Title 1' to 'Title 3'."""
     found = list(iter_books())
     self.assertEqual(len(found), 3)
     # Sort the titles so the check does not depend on iteration order.
     titles = sorted(entry.title for entry in found)
     self.assertEqual(titles, ['Title 1', 'Title 2', 'Title 3'])