def collect_verbs(list_json=COOLJUGATOR_LIST_JSON):
    """Scrape the full Greek verb list from Cooljugator and dump it as JSON.

    Each list item's text is split once on '-' into a [verb, translation-ish]
    pair (both parts stripped) and the resulting list of pairs is written to
    *list_json* via dump_utf_json.
    """
    soup = BeautifulSoup(
        requests.get('https://cooljugator.com/gr/list/all').content, 'lxml')
    pairs = []
    for item in soup.find_all('li', {'class': 'item'}):
        pairs.append([part.strip() for part in item.text.split('-', 1)])
    dump_utf_json(pairs, list_json)
def scrape_booklist_from_file(target, src_txt='data/booklist_src.txt'):
    """Scrape every book URL listed in *src_txt* and dump the results.

    Writes the raw scraped data to ``target + '.json'`` and an HTML rendering
    to ``target + '.txt'`` (via convert_to_html).

    :param target: basename for the two output files
    :param src_txt: text file with one book URL per line
    """
    target_raw = target + '.json'
    target_html = target + '.txt'
    print(f"Comprising {target_raw} & {target_html} from {src_txt}...")
    # Explicit encoding: without it, open() uses the locale's default,
    # which breaks on non-ASCII URLs on some systems.
    with open(src_txt, 'rt', encoding='utf-8') as handler:
        # Skip blank lines (e.g. a trailing newline) so we never call
        # scrape_book('') on an empty URL.
        book_urls = [line.strip() for line in handler if line.strip()]
    books = [scrape_book(book_url) for book_url in book_urls]
    dump_utf_json(books, target_raw)
    convert_to_html(target_raw, target_html)
def collect_do_transls():
    """Scrape a translation for every verb listed in DO_JSON.

    For each verb, fetches its Cooljugator page, takes the first
    parenthesized group from the 'mainform' span text, and dumps the
    {verb: translation} mapping to COOLJUGATOR_DO_TRANSL.
    """
    pattern = re.compile(r'\((.+?)\)')
    doed = load_utf_json(DO_JSON)
    transls = {}
    count = counter(len(doed))
    for verb in doed:
        next(count)
        page = BeautifulSoup(
            requests.get('https://cooljugator.com/gr/' + verb).content,
            'lxml')
        mainform_text = page.find('span', id='mainform').text
        transls[verb] = pattern.findall(mainform_text)[0]
    print()
    dump_utf_json(transls, COOLJUGATOR_DO_TRANSL)
def launch(self):
    """Run the full scrape pipeline for one post.

    Scrapes the target, post, pages, and comments, assembles them into a
    dict (comments sorted by thread URL), optionally dumps it to
    self.target_json, and returns the dict.
    """
    print(f"Launching @ {self.post_url}...")
    self.get_target()
    self.scrape_post()
    self.scrape_pages()
    self.scrape_comments()
    ordered_comments = sorted(self.comments, key=lambda c: c['thread_url'])
    post = {
        'url': self.post_url,
        'post': self.post,
        'comments': ordered_comments,
    }
    if self.target_json:
        dump_utf_json(post, self.target_json)
    return post
def get_fieldnames(list_json=COOLJUGATOR_LIST_JSON,
                   fieldnames_json='cooljugator_fieldnames.json'):
    """Collect the distinct conjugation-cell ids across all listed verbs.

    Fetches each verb's Cooljugator page, gathers the 'id' attribute of
    every div with class 'conjugation-cell' (cells without an id are
    skipped), and dumps the sorted set to *fieldnames_json*.
    """
    fieldnames = set()
    verbs = load_utf_json(list_json)
    count = counter(len(verbs))
    for verb, _ in verbs:
        next(count)
        soup = BeautifulSoup(
            requests.get('https://cooljugator.com/gr/' + verb).content,
            'lxml')
        for cell in soup.find_all('div', {'class': 'conjugation-cell'}):
            cell_id = cell.attrs.get('id')
            if cell_id is not None:
                fieldnames.add(cell_id)
    dump_utf_json(sorted(fieldnames), fieldnames_json)
def collect_duplicates():
    """Find verbs appearing in more than one VERBS entry and dump them.

    Iterates every document in the VERBS collection; each entry's VERB field
    may be a single string or a list of strings. Any verb seen a second time
    is recorded as a duplicate, and the sorted duplicates are written to
    DUPLICATES_JSON.
    """
    visited = set()
    duplicates = set()
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    # Collection.count() was deprecated and removed in PyMongo 4.x;
    # count_documents({}) is the supported replacement.
    count = counter(coll.count_documents({}))
    for entry in coll.find():
        next(count)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            if verb in visited:
                duplicates.add(verb)
            else:
                visited.add(verb)
    print("\nDumping {} duplicates".format(len(duplicates)))
    dump_utf_json(sorted(duplicates), DUPLICATES_JSON)
def collect_do():
    """Dump (sorted) every verb whose TRANSL field equals 'do' to DO_JSON."""
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    do_verbs = sorted(entry[VERB] for entry in coll.find({TRANSL: 'do'}))
    dump_utf_json(do_verbs, DO_JSON)
def launch(self):
    """Scrape all pages, sort the collected books, and dump JSON + HTML.

    Books are sorted in place by the last whitespace-separated token of
    each book's first field before dumping to self.target_raw and
    converting to self.target_html.
    """
    self.scroll_pages()
    self.books.sort(key=lambda book: book[0].split()[-1])
    print(f"Dumping {self.target_raw}...")
    dump_utf_json(self.books, self.target_raw)
    convert_to_html(self.target_raw, self.target_html)