import re

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# dump_utf_json / load_utf_json, counter, and the upper-case *_JSON and Mongo
# constants are project-local helpers (plausible sketches follow below).

def collect_verbs(list_json=COOLJUGATOR_LIST_JSON):
    # Each cooljugator index item reads "verb - translation"; store the pair.
    dump_utf_json([
        list(map(str.strip, verb.text.split('-', 1)))
        for verb in BeautifulSoup(
            requests.get('https://cooljugator.com/gr/list/all').content,
            'lxml').find_all('li', {'class': 'item'})
    ], list_json)
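# Every snippet here leans on dump_utf_json / load_utf_json. Their bodies are
# not part of this excerpt; a minimal sketch, assuming they are thin UTF-8
# JSON wrappers (the real signatures may differ):
import json

def dump_utf_json(data, target_json):
    # Write JSON without escaping non-ASCII, which matters for Greek verbs.
    with open(target_json, 'w', encoding='utf-8') as handler:
        json.dump(data, handler, ensure_ascii=False, indent=2)

def load_utf_json(src_json):
    # Read back what dump_utf_json wrote.
    with open(src_json, encoding='utf-8') as handler:
        return json.load(handler)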
def scrape_booklist_from_file(target, src_txt='data/booklist_src.txt'):
    # scrape_book() and convert_to_html() are project helpers not shown here.
    target_raw = target + '.json'
    target_html = target + '.txt'
    print(f"Compiling {target_raw} & {target_html} from {src_txt}...")
    with open(src_txt, 'rt') as handler:
        book_urls = list(map(str.strip, handler.readlines()))
    books = [scrape_book(book_url) for book_url in book_urls]
    dump_utf_json(books, target_raw)
    convert_to_html(target_raw, target_html)
def collect_do_transls():
    # For every verb whose stored gloss is the bare 'do' (see collect_do()),
    # fetch its page and keep the first parenthesised translation from the
    # main-form header.
    pattern = re.compile(r'\((.+?)\)')
    doed = load_utf_json(DO_JSON)
    transls = dict()
    count = counter(len(doed))
    for verb in doed:
        next(count)
        transls[verb] = pattern.findall(
            BeautifulSoup(
                requests.get('https://cooljugator.com/gr/' + verb).content,
                'lxml').find('span', id='mainform').text)[0]
    print()  # end the progress line
    dump_utf_json(transls, COOLJUGATOR_DO_TRANSL)
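# Several functions here drive a counter(total) generator with next(count) to
# show progress. Its body is not in the excerpt; a plausible sketch that
# rewrites a single "i/total" console line per step (the bare print() in the
# callers then ends that line):
def counter(total):
    for i in range(1, total + 1):
        print('\r{}/{}'.format(i, total), end='', flush=True)
        yield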
def launch(self):
    # Method of the post-scraper class (class definition not shown in this
    # excerpt): scrape the post, its pages and comments, then return, and
    # optionally dump, everything as one dict.
    print("Launching @ {}...".format(self.post_url))
    self.get_target()
    self.scrape_post()
    self.scrape_pages()
    self.scrape_comments()
    post = {
        'url': self.post_url,
        'post': self.post,
        'comments': sorted(self.comments, key=lambda c: c['thread_url'])
    }
    if self.target_json:
        dump_utf_json(post, self.target_json)
    return post
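# Hypothetical usage of launch(): the class name and constructor below are
# guesses, assuming only the attributes the method body actually reads
# (post_url, target_json).
scraper = PostScraper(post_url='https://example.com/post/123',
                      target_json='post.json')
post = scraper.launch()  # scrapes everything; dumps only if target_json is set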
def get_fieldnames(list_json=COOLJUGATOR_LIST_JSON,
                   fieldnames_json='cooljugator_fieldnames.json'):
    # Walk every verb page and collect the ids of its conjugation cells;
    # cells without an id are skipped.
    fieldnames = set()
    verbs = load_utf_json(list_json)
    count = counter(len(verbs))
    for verb, _ in verbs:
        next(count)
        for cell in BeautifulSoup(
                requests.get('https://cooljugator.com/gr/' + verb).content,
                'lxml').find_all('div', {'class': 'conjugation-cell'}):
            cell_id = cell.attrs.get('id')
            if cell_id is not None:
                fieldnames.add(cell_id)
    dump_utf_json(sorted(fieldnames), fieldnames_json)
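# Hypothetical driver, inferred from the data flow: collect_verbs() writes the
# verb list that get_fieldnames() reads, so these steps only make sense in
# this order. Not part of the original excerpt.
if __name__ == '__main__':
    collect_verbs()    # scrape the index into COOLJUGATOR_LIST_JSON
    get_fieldnames()   # walk every verb page and dump the conjugation-cell ids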
def collect_duplicates():
    # The VERB field may hold a single string or a list of variants; flag any
    # verb that appears in more than one entry.
    visited = set()
    duplicates = set()
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    count = counter(coll.count_documents({}))  # .count() is gone in pymongo 4
    for entry in coll.find():
        next(count)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            if verb in visited:
                duplicates.add(verb)
            else:
                visited.add(verb)
    print("\nDumping {} duplicates".format(len(duplicates)))
    dump_utf_json(sorted(duplicates), DUPLICATES_JSON)
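# An equivalent, shorter take on collect_duplicates() using
# collections.Counter; a sketch under the same str-or-list assumption for the
# VERB field, not the original code.
from collections import Counter

def collect_duplicates_counter():
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    counts = Counter(
        verb
        for entry in coll.find()
        for verb in ([entry[VERB]] if isinstance(entry[VERB], str)
                     else entry[VERB])
    )
    dump_utf_json(sorted(v for v, n in counts.items() if n > 1),
                  DUPLICATES_JSON)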
def collect_do():
    # Dump every verb whose stored translation is the bare gloss 'do';
    # collect_do_transls() then fetches a more specific gloss for each.
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    dump_utf_json(sorted(entry[VERB] for entry in coll.find({TRANSL: 'do'})),
                  DO_JSON)
def launch(self):
    # Method of the book-list scraper class (class definition not shown):
    # scroll through all pages, sort the books by the last word of their
    # first field, then dump JSON and an HTML rendering.
    self.scroll_pages()
    self.books.sort(key=lambda b: b[0].split()[-1])
    print("Dumping {}...".format(self.target_raw))
    dump_utf_json(self.books, self.target_raw)
    convert_to_html(self.target_raw, self.target_html)