def scrape_comments(self):
    """Scrape comments for every collected thread URL.

    Walks ``self.thread_urls``, delegating each URL to
    ``self.scrape_comment`` while ticking a progress counter, then reports
    how many comments were stored and lists the thread URLs that proved
    unavailable.
    """
    print("Scraping comments...")
    progress = counter(len(self.thread_urls))
    for url in self.thread_urls:
        self.scrape_comment(url)
        next(progress)
    print("\nScraped {} comments. {} comments proved unavailable".format(
        len(self.comments), len(self.unavailable_comments)))
    for url in self.unavailable_comments:
        print(url)
def fix_do_transls():
    """Replace placeholder 'do' translations with the real ones.

    For each (verb, translation) pair loaded from DO_JSON, fetch the verb
    entry whose translation is still the literal 'do', overwrite that field
    with the proper translation, and save the entry back to MongoDB.
    """
    verbs_coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    pairs = load_utf_json(DO_JSON).items()
    progress = counter(len(pairs))
    for verb, translation in pairs:
        next(progress)
        # First (and presumably only) entry still carrying the placeholder.
        entry = verbs_coll.find({VERB: verb, TRANSL: 'do'})[0]
        entry[TRANSL] = translation
        verbs_coll.save(entry)
    print()
def filter_verbs(filtered_list_json, unfiltered_list_json=UNFILTERED_WIKILEXICO_LIST_JSON):
    """Yield verbs from *unfiltered_list_json* that are absent from MongoDB.

    *filtered_list_json* only appears in the progress message — callers are
    expected to write the yielded verbs there themselves.
    """
    print("Filtering {} --> {}".format(unfiltered_list_json, filtered_list_json))
    progress = counter()
    verbs_coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    for verb in read_json_lines(unfiltered_list_json):
        next(progress)
        if verbs_coll.find({VERB: verb}).count() == 0:
            yield verb
    print()
def collect_do_transls():
    """Scrape real translations for the 'do'-placeholder verbs.

    For every verb listed in DO_JSON, fetch its cooljugator.com page and
    take the first parenthesised phrase of the ``mainform`` span as the
    translation, then dump the verb→translation map to
    COOLJUGATOR_DO_TRANSL.
    """
    parenthesised = re.compile(r'\((.+?)\)')
    verbs = load_utf_json(DO_JSON)
    progress = counter(len(verbs))
    translations = {}
    for verb in verbs:
        next(progress)
        page = BeautifulSoup(
            requests.get('https://cooljugator.com/gr/' + verb).content, 'lxml')
        translations[verb] = parenthesised.findall(
            page.find('span', id='mainform').text)[0]
    print()
    dump_utf_json(translations, COOLJUGATOR_DO_TRANSL)
def edit_field(fieldname, func, fltr=None, dbname=DB_NAME, collname=VERBS):
    """Rewrite *fieldname* on every entry matching *fltr* by applying *func*.

    *func* receives the field's current value (or ``None`` when absent) and
    its return value is saved back.  An empty/None *fltr* matches the whole
    collection.
    """
    fltr = fltr or {}
    print("{}.{}: editing '{}' with '{}'...".format(dbname, collname, fieldname, func.__name__))
    coll = MongoClient(LOCALHOST, PORT)[dbname][collname]
    matches = coll.find(fltr)
    progress = counter(matches.count())
    for entry in matches:
        next(progress)
        entry[fieldname] = func(entry.get(fieldname))
        coll.save(entry)
    print()
def get_fieldnames(list_json=COOLJUGATOR_LIST_JSON, fieldnames_json='cooljugator_fieldnames.json'):
    """Collect the distinct conjugation-cell ids used by cooljugator.com.

    For every (verb, _) pair in *list_json*, fetch the verb's cooljugator
    page and record the ``id`` attribute of each ``div.conjugation-cell``
    that has one.  The sorted ids are dumped to *fieldnames_json*.
    """
    fieldnames = set()
    verbs = load_utf_json(list_json)
    progress = counter(len(verbs))
    for verb, _ in verbs:
        next(progress)
        soup = BeautifulSoup(
            requests.get('https://cooljugator.com/gr/' + verb).content, 'lxml')
        for cell in soup.find_all('div', {'class': 'conjugation-cell'}):
            # Tag.get returns None for a missing attribute — no need for
            # per-cell try/except KeyError control flow.
            cell_id = cell.get('id')
            if cell_id is not None:
                fieldnames.add(cell_id)
    # sorted() accepts any iterable; wrapping the set in list() first is
    # redundant.
    dump_utf_json(sorted(fieldnames), fieldnames_json)
def remove_morphologically_abnormal_verbs():
    """Delete verb entries whose form does not look like a Greek verb.

    A verb is considered normal when it ends in 'ω', 'ώ' or 'αι'; entries
    for any other form are removed from the collection.  Prints the number
    of deletions performed.
    """
    abnormal_count = 0
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    progress = counter(coll.count())
    # NOTE(review): documents are deleted while the same cursor is being
    # iterated — with some MongoDB configurations this can skip entries;
    # consider snapshotting the matching ids first.  Behavior preserved.
    for entry in coll.find():
        next(progress)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            # str.endswith accepts a tuple of suffixes — a single call
            # replaces the original or-chain.
            if not verb.endswith(('ω', 'ώ', 'αι')):
                coll.delete_one({VERB: verb})
                abnormal_count += 1
    print("\nRemoved {} abnormal verbs".format(abnormal_count))
def print_verbs(fieldname, fltr, func=None, dbname=DB_NAME, collname=VERBS):
    """Print *fieldname* of every entry matching *fltr*.

    When *func* is given, only entries for which ``func(entry)`` is truthy
    are printed (with a progress counter); otherwise all matches are
    printed directly.
    """
    cursor = MongoClient(LOCALHOST, PORT)[dbname][collname].find(fltr)
    total = cursor.count()
    if func is None:
        print(total, "matching entries")
        selected = [entry[fieldname] for entry in cursor]
    else:
        selected = []
        progress = counter(total)
        for entry in cursor:
            next(progress)
            if func(entry):
                selected.append(entry[fieldname])
        print("\n{} matching items".format(len(selected)))
    for value in selected:
        print(value)
def copy_collection(target_collname, dbname=DB_NAME, source_collname=VERBS, indices=(VERB, PARADIGM)):
    """Copy every entry of *source_collname* into a freshly dropped *target_collname*.

    The target collection is dropped first, then repopulated document by
    document, and finally re-indexed via ``add_indices``.

    Raises:
        ValueError: if target and source collection names coincide.
    """
    print("[{}]: copying [{}] to [{}]...".format(dbname, source_collname, target_collname))
    # `assert` is stripped under `python -O`; validate with a real exception
    # so the guard cannot silently disappear.
    if target_collname == source_collname:
        raise ValueError("Collections should not have identical names")
    database = MongoClient(LOCALHOST, PORT)[dbname]
    target_coll = database[target_collname]
    target_coll.drop()
    source_coll = database[source_collname]
    count = counter(source_coll.count())
    for entry in source_coll.find():
        next(count)
        target_coll.insert(entry)
    add_indices(target_coll, indices)
def collect_active_voice_paradigms(raw_paradigm_json, list_json=FILTERED_WIKILEXICO_LIST_JSON):
    """Yield shortened active-voice paradigm entries for verbs in *list_json*.

    Args:
        raw_paradigm_json: accepted for signature compatibility; not used
            inside this generator — presumably the caller's dump target.
            TODO confirm against callers.
        list_json: JSON-lines file of verbs to process.

    Yields:
        dict: the shortened entry with the full paradigm stored under
        ``PARADIGM``.  Verbs for which ``get_paradigm`` returns a falsy
        paradigm are skipped.
    """
    progress = counter()
    for verb in read_json_lines(list_json):
        next(progress)
        try:
            paradigm = get_paradigm(verb, active_voice=True)
        except Exception:
            # Report the offending verb, then re-raise with the original
            # traceback intact (bare `raise` instead of `raise e`).
            print()
            print(verb)
            print()
            raise
        if paradigm:
            entry = get_shortened(paradigm)
            entry[PARADIGM] = paradigm
            yield entry
def upload(source_json, source, db_name=DB_NAME, coll_name=VERBS, drop=False, indices=(VERB, PARADIGM)):
    """Insert every JSON line of *source_json* into MongoDB.

    Each entry is tagged with *source* under the SOURCE key before
    insertion.  When *drop* is true the collection is emptied first.
    Entry counts are printed before and after, and indices are rebuilt
    via ``add_indices``.
    """
    coll = MongoClient(LOCALHOST, PORT)[db_name][coll_name]
    if drop:
        coll.drop()
    print('Initially,', coll.count(), 'entries')
    progress = counter()
    for entry in read_json_lines(source_json):
        next(progress)
        entry[SOURCE] = source
        coll.insert(entry)
    add_indices(coll, indices)
    print('\nCurrently,', coll.count(), 'entries')
def collect_duplicates():
    """Dump verbs that occur in more than one collection entry.

    Scans every entry (whose VERB field may be a single string or a list),
    tracks first sightings in a set, and writes the sorted duplicates to
    DUPLICATES_JSON.
    """
    visited = set()
    duplicates = set()
    coll = MongoClient(LOCALHOST, PORT)[DB_NAME][VERBS]
    count = counter(coll.count())
    for entry in coll.find():
        next(count)
        verbs = entry[VERB]
        if isinstance(verbs, str):
            verbs = [verbs]
        for verb in verbs:
            if verb in visited:
                duplicates.add(verb)
            else:
                visited.add(verb)
    print("\nDumping {} duplicates".format(len(duplicates)))
    # sorted() accepts any iterable — materialising a list first is
    # redundant.
    dump_utf_json(sorted(duplicates), DUPLICATES_JSON)
def add_field(fieldname, fieldcontent, fltr=None, dbname=DB_NAME, collname=VERBS):
    """Set *fieldname* to *fieldcontent* on every entry matching *fltr*.

    An empty/None *fltr* matches the whole collection; each matching entry
    is updated in place and saved back.
    """
    fltr = fltr or {}
    print("{}.{}: setting '{}' to \"{}\"...".format(dbname, collname, fieldname, fieldcontent))
    coll = MongoClient(LOCALHOST, PORT)[dbname][collname]
    matches = coll.find(fltr)
    progress = counter(matches.count())
    for entry in matches:
        next(progress)
        entry[fieldname] = fieldcontent
        coll.save(entry)
    print()
def collect_paradigms(raw_paradigm_json, list_json=COOLJUGATOR_LIST_JSON):
    """Yield a full conjugation paradigm for every verb in *list_json*.

    ``raw_paradigm_json`` is accepted for signature compatibility but not
    used here — presumably the caller's dump target (TODO confirm).
    Per-verb scraping errors are accumulated and printed once the
    generator is exhausted.
    """
    verbs = load_utf_json(list_json)
    exceptions = dict()
    progress = counter(len(verbs))
    for verb, transl in verbs:
        next(progress)
        paradigm, errors = get_paradigm(verb)
        paradigm.update({VERB: verb, TRANSL: transl})
        if errors:
            exceptions[verb] = errors
        yield paradigm
    if exceptions:
        print('\n\nExceptions:')
        for failed_verb, errors in exceptions.items():
            print(failed_verb)
            for error in errors:
                print(" {}".format(error))
            print()
def collect_verbs(unfiltered_list_json):
    """Yield verb page titles from the Greek Wiktionary verb category.

    Walks the paginated category listing, yielding the text of every anchor
    whose ``title`` attribute equals its text (i.e. a real entry link), and
    stops when no next-page link is present.  *unfiltered_list_json* only
    appears in the progress message; callers do the actual writing.
    """
    print("Collecting verbs to", unfiltered_list_json)
    page_url = 'https://el.wiktionary.org/w/index.php?title=%CE%9A%CE%B1%CF%84%CE%B7%CE%B3%CE%BF%CF%81%CE%AF%CE%B1:' \
               '%CE%A1%CE%AE%CE%BC%CE%B1%CF%84%CE%B1_(%CE%BD%CE%AD%CE%B1_%CE%B5%CE%BB%CE%BB%CE%B7%CE%BD%CE%B9%CE%BA' \
               '%CE%AC)&from=%CE%B1#mw-pages'
    progress = counter()
    while True:
        soup = BeautifulSoup(requests.get(page_url).content, 'lxml')
        for link in soup.find_all('a', href=True):
            text = link.text
            if link.get('title') == text:
                next(progress)
                yield text
        next_page = soup.find('a', href=True, text="επόμενη σελίδα")
        if next_page is None:
            break
        page_url = 'https://el.wiktionary.org' + next_page['href']
    print()