def wikitext_one_sentence(left_context, nkjp_match, match_base_form):
    """Wikise the left context plus an NKJP match for printing.

    This is a way to circumvent doing wikilink('[[word]]'), which doesn't
    work properly as of 01/2015.

    Args:
        left_context (str): text preceding the matched sentence, wikised as-is
        nkjp_match (dict): NKJP match with 'lTks' (left), 'mTks' (match) and
            'rTks' (right) token lists; each token is a '|'-separated string
            whose first field is the word (e.g. 'w:"') and whose third field
            carries the tag read as split('|')[2][2:] -- TODO confirm the
            exact token layout against the NKJP fetch code
        match_base_form (str): base form of the queried word, for wikisation

    Returns:
        str: [[the|The]] [[input]] [[sentence]] [[format]]ted [[like]] [[this]].
    """
    left_ctx_wikised = wikilink(left_context)
    left_match_wikised = wikilink(NKJP_to_text(nkjp_match['lTks']))
    final_sentence = left_ctx_wikised + left_match_wikised
    # quote parity decides whether a '"' next to the match opens or closes
    # a quotation (odd count == we are inside an open quotation)
    quote_count = final_sentence.count('"')
    try:
        last_left = nkjp_match['lTks'][-1].split('|')[0]
    except IndexError:
        # no left tokens at all
        last_left = None
    try:
        first_right = nkjp_match['rTks'][0].split('|')[0]
    except IndexError:
        # no right tokens: nothing follows the match
        first_right = None
        first_right_tag = ''
    else:
        first_right_tag = nkjp_match['rTks'][0].split('|')[2][2:]
    # no space after an opening quote or an opening parenthesis
    if not ((last_left == 'w:"' and quote_count % 2 == 1)
            or last_left == 'w:('):
        final_sentence += ' '
    final_sentence += shortLink(match_base_form,
                                NKJP_to_text(nkjp_match['mTks']))
    # no space before a closing quote or before punctuation
    if not ((first_right == 'w:"' and quote_count % 2 == 1)
            or 'punct:interp' in first_right_tag):
        final_sentence += ' '
    final_sentence += wikilink(NKJP_to_text(nkjp_match['rTks']))
    return final_sentence.strip()
def wikitext_one_sentence(left_match_right, match_base_form):
    """Take a tuple with the left and right side of the matched word and
    format it for printing.

    This is a way to circumvent doing wikilink('[[word]]'), which doesn't
    work properly as of 01/2015.

    Args:
        left_match_right (tuple): a tuple of three strings: the left side
            of the NKJP match, the match itself (in [[baseform|match]] form)
            and the right side
        match_base_form (str): base form of the queried word, for wikisation

    Returns:
        str: [[the|The]] [[input]] [[sentence]] [[format]]ted [[like]] [[this]].
    """
    left_part = left_match_right[0]
    matched = left_match_right[1]
    right_part = left_match_right[2]

    # whitespace that separates the match from its contexts
    trailing_ws = re.compile(r'(\s*?)$').search(left_part)
    leading_ws = re.compile(r'^(\s*)').search(right_part)
    # https://regex101.com/r/yB6tQ8/6
    # punctuation glued to the matched word, on either side
    around = re.compile(r'^([\W]*?)(.+?)([\W]*?)$').search(matched)

    pieces = [wikilink(left_part)]
    if trailing_ws:
        pieces.append(trailing_ws.group(1))
    if around:
        # re-attach surrounding punctuation outside the link
        pieces.append(around.group(1))
        pieces.append(shortLink(match_base_form, around.group(2)))
        pieces.append(around.group(3))
    else:
        pieces.append(matched)
    if leading_ws:
        pieces.append(leading_ws.group(1))
    pieces.append(wikilink(right_part))

    return phrases_wikilink(''.join(pieces))
def check_if_includes_orphan(sentence, orphan_list, excluded_orphans):
    """Return the first orphaned base form found in a sentence, else None.

    Args:
        sentence: indexable of strings; sentence[0] and sentence[2] are the
            left and right context of the match -- presumably the tuple fed
            to wikitext_one_sentence; verify against callers
        orphan_list (str): newline-separated '*word' list of orphan entries
        excluded_orphans: container of base forms that must not be reported

    Returns:
        str or None: the orphaned base form, or None when nothing matches.
    """
    # extracts the first part of [[these|links]]
    re_base_form = re.compile(r'\[\[(.*?)(?:\||\]\])')
    allwords = re.findall(re_base_form, wikilink(sentence[0] + sentence[2]))
    for word in allwords:
        # BUG FIX: the original tested `' się' in word` but then always cut
        # the last 4 chars, corrupting words where ' się' is not the suffix;
        # strip the reflexive particle only when it really ends the word
        if word.endswith(' się'):
            word = word[:-4]
        if '\n*{0}\n'.format(word) in orphan_list and word not in excluded_orphans:
            return word
    return None
def check_if_includes_orphan(sentence, orphan_list, excluded_orphans):
    """Scan a sentence's wikilinks for an orphaned word.

    Wikises the left (sentence[0]) and right (sentence[2]) context, pulls
    the base form out of every [[base|link]], and returns the first base
    form present in orphan_list and not excluded; otherwise returns None.
    """
    # extracts the first part of [[these|links]]
    link_base = re.compile(r'\[\[(.*?)(?:\||\]\])')
    wikified = wikilink(sentence[0] + sentence[2])
    for base in link_base.findall(wikified):
        if ' się' in base:
            # drop the reflexive particle (the last four characters)
            base = base[:-4]
        listed = '\n*{0}\n'.format(base)
        if listed in orphan_list and base not in excluded_orphans:
            return base
    return None
def orphaned_examples(test_word=None,
                      hashtable=None,
                      online=False,
                      complete_overwrite=False,
                      onepage_testmode=False):
    """Fetch NKJP example sentences for words whose entries lack examples.

    Args:
        test_word (str): if given, process only this word (testing helper)
        hashtable (dict): pre-loaded author hashtable; read from disk if None
        online (bool): publish result pages on the wiki instead of only
            writing local JSON files
        complete_overwrite (bool): regenerate all pages; when False, stop
            after queueing roughly twice the currently active word count
        onepage_testmode (bool): write a single test page (number 666)

    Returns:
        int or None: 0 on normal page-limit completion, 2 after writing the
        example queue, None when the input list is exhausted.
    """
    buffer_size = 20  # how many words will be printed on one page
    if online:
        # prepare only as many pages as we need at the moment
        active_words = fetch_active_words()
    else:
        active_words = {'active': [], 'inactive': [], 'under_review': []}
    edit_history = read_edit_history()
    excluded_words = active_words['active'] + edit_history['added']
    # list of pages with no examples (obtained by empty_section.py)
    with open('output/empty_sections.txt', 'r') as g:
        empty_sections = g.readlines()
    random.shuffle(empty_sections)
    if not complete_overwrite:
        excluded_words += active_words['inactive']
    else:
        excluded_words += active_words['under_review']
    authors_hashtable = hashtable if hashtable else read_author_hashtable()
    site = pwb.Site()
    words_count = 0
    # NOTE(review): the original re-opened empty_sections.txt here unused,
    # and defined an unused re_base_form plus dead `found` counters; removed.
    with open('input/orphans.txt') as f:
        orphans = f.read()
    # for testing purposes
    if test_word:
        empty_sections = [test_word]
    pages_count = 666 if onepage_testmode else 0  # loop helper
    output = []  # list-container for examples
    for input_word in empty_sections:
        if not complete_overwrite and words_count > 2 * len(active_words['active']):
            # enough words queued for review: dump the queue and stop
            with open('output/example_queue.json', 'w') as o:
                formatted_output = json.dumps(ordermydict(output),
                                              ensure_ascii=False, indent=4)
                o.write(formatted_output)
            return 2
        if (pages_count == 101) or (pages_count == 667 and onepage_testmode):
            return 0
        # dealing with various list formats, e.g. *[[word]]
        input_word = input_word.strip('*[]\n')
        # BUG FIX: original wrote `input_word.upper == input_word`, comparing
        # the bound method to a string (always False); call the method so the
        # all-uppercase skip actually works
        if len(input_word) < 4 or input_word.upper() == input_word:
            continue
        if input_word in excluded_words:
            continue
        print(input_word)
        if complete_overwrite and len(output) == buffer_size:
            # write to file/page every buffer_size words
            formatted_output = json.dumps(ordermydict(output),
                                          ensure_ascii=False, indent=4)
            if online:
                while True:
                    output_page = pwb.Page(
                        site,
                        'Wikisłownik:Dodawanie przykładów/dane/{0:03d}'.format(
                            pages_count - 1))
                    # only touch the test page (666) or pages last edited by
                    # the bot itself
                    if pages_count == 666 or output_page.userName() == 'AlkamidBot':
                        output_page.text = formatted_output
                        output_page.save(
                            comment='Pobranie nowych przykładów z NKJP.pl')
                        break
                    else:
                        pages_count += 1
                        if pages_count == 100:
                            return 0
            # NOTE(review): indentation reconstructed from a collapsed
            # source -- local JSON copy is written in online mode too;
            # confirm against version history
            with open('output/json_examples_{0}.json'.format(pages_count),
                      'w') as o:
                o.write(formatted_output)
            pages_count += 1
            output = []
        if input_word[0] == '-' or input_word[-1] == '-' or input_word[0].isupper():
            # let's skip prefixes and sufixes for now, also whatever starts
            # with a capital leter
            continue
        query = '{0}**'.format(input_word).replace(' ', '** ')
        result = nkjp_lookup(query)
        root = etree.parse(result).getroot()
        if root.find('concordance') is None:
            continue
        defs = get_definitions(input_word)
        if defs == 0:
            continue
        new_word = ExampleDict()
        new_word['title'] = input_word
        new_word['fetch_time'] = str(defs[1])
        new_word['definitions'] = defs[0]
        for line in root.find('concordance').findall('line'):
            sentence = extract_one_sentence(line, input_word)
            # NKJP treats gerunds as verb forms. We don't.
            if '\'\'czasownik' in new_word['definitions'] and\
                    all(('ger:' in analysed[2] or 'subst:' in analysed[2])
                        for analysed in morfeusz.analyse(sentence[1])[0]):
                continue
            if check_sentence_quality(sentence) == 0:
                continue
            ref = get_reference(line, authors_hashtable)
            if ref == '':
                break
            if len(new_word['examples']) < 2:
                temp_example = {'verificator': 'None',
                                'correct_num': 'None',
                                'good_example': False,
                                'bad_example': False}
                temp_example['example'] = wikitext_one_sentence(sentence, input_word)
                temp_example['left_extra'] = phrases_wikilink(wikilink(sentence[3]))
                temp_example['right_extra'] = phrases_wikilink(wikilink(sentence[4]))
                temp_example['source'] = ref
                orphan_switch = check_if_includes_orphan(
                    sentence, orphans, edit_history['orphans'])
                temp_example['orphan'] = orphan_switch
                new_word['examples'].append(temp_example)
            else:
                # already have two candidates: keep the better-wikified one,
                # preferring examples that contain an orphaned word
                found_new = 0
                wikified_example = wikitext_one_sentence(sentence, input_word)
                for ex_ix, ex in enumerate(new_word['examples']):
                    neworphan = check_if_includes_orphan(
                        sentence, orphans, edit_history['orphans'])
                    if neworphan:
                        if ex['orphan']:
                            if wikified_proportion(ex['example']) < \
                                    wikified_proportion(wikified_example):
                                new_example = new_word['examples'][ex_ix]
                                found_new = 1
                                orphan_switch = neworphan
                                break
                        elif not orphan_switch:
                            new_example = new_word['examples'][ex_ix]
                            found_new = 1
                            break
                    else:
                        if not ex['orphan']:
                            if wikified_proportion(ex['example']) < \
                                    wikified_proportion(wikified_example):
                                new_example = new_word['examples'][ex_ix]
                                found_new = 1
                                break
                if found_new:
                    new_example['orphan'] = neworphan
                    # wikified_example was computed above from the same inputs
                    new_example['example'] = wikified_example
                    new_example['left_extra'] = phrases_wikilink(wikilink(sentence[3]))
                    new_example['right_extra'] = phrases_wikilink(wikilink(sentence[4]))
                    new_example['source'] = ref
        if new_word and len(new_word['examples']) > 0:
            output.append(new_word)
            words_count += 1
def orphaned_examples(test_word=None,
                      hashtable=None,
                      online=False,
                      complete_overwrite=False,
                      onepage_testmode=False):
    """Collect NKJP example sentences for entries that currently lack them.

    Args:
        test_word (str): process only this word when given (for testing)
        hashtable (dict): author hashtable; loaded from disk when None
        online (bool): save result pages to the wiki as well
        complete_overwrite (bool): regenerate everything; when False, only
            queue about twice the number of currently active words
        onepage_testmode (bool): produce a single test page (number 666)

    Returns:
        int or None: 0 at the page limit, 2 after writing the example
        queue, None when the word list runs out.
    """
    buffer_size = 20  # how many words will be printed on one page
    if online:
        # prepare only as many pages as we need at the moment
        active_words = fetch_active_words()
    else:
        active_words = {'active': [], 'inactive': [], 'under_review': []}
    edit_history = read_edit_history()
    excluded_words = active_words['active'] + edit_history['added']
    # list of pages with no examples (obtained by empty_section.py)
    with open('output/empty_sections.txt', 'r') as g:
        empty_sections = g.readlines()
    random.shuffle(empty_sections)
    if not complete_overwrite:
        excluded_words += active_words['inactive']
    else:
        excluded_words += active_words['under_review']
    authors_hashtable = hashtable if hashtable else read_author_hashtable()
    site = pwb.Site()
    words_count = 0
    # NOTE(review): dropped an unused second open of empty_sections.txt, an
    # unused re_base_form pattern and dead `found`/`found_orphan` locals.
    with open('input/orphans.txt') as f:
        orphans = f.read()
    # for testing purposes
    if test_word:
        empty_sections = [test_word]
    pages_count = 666 if onepage_testmode else 0  # loop helper
    output = []  # list-container for examples
    for input_word in empty_sections:
        if not complete_overwrite and words_count > 2 * len(active_words['active']):
            # queue is long enough: persist it and stop
            with open('output/example_queue.json', 'w') as o:
                formatted_output = json.dumps(ordermydict(output),
                                              ensure_ascii=False, indent=4)
                o.write(formatted_output)
            return 2
        if (pages_count == 101) or (pages_count == 667 and onepage_testmode):
            return 0
        # dealing with various list formats, e.g. *[[word]]
        input_word = input_word.strip('*[]\n')
        # BUG FIX: `input_word.upper == input_word` compared the method
        # object itself (always False); the call was missing
        if len(input_word) < 4 or input_word.upper() == input_word:
            continue
        if input_word in excluded_words:
            continue
        print(input_word)
        if complete_overwrite and len(output) == buffer_size:
            # flush to file/page every buffer_size words
            formatted_output = json.dumps(ordermydict(output),
                                          ensure_ascii=False, indent=4)
            if online:
                while True:
                    output_page = pwb.Page(
                        site,
                        'Wikisłownik:Dodawanie przykładów/dane/{0:03d}'.format(
                            pages_count - 1))
                    # only overwrite the test page (666) or pages last
                    # touched by the bot account
                    if pages_count == 666 or output_page.userName() == 'AlkamidBot':
                        output_page.text = formatted_output
                        output_page.save(
                            comment='Pobranie nowych przykładów z NKJP.pl')
                        break
                    else:
                        pages_count += 1
                        if pages_count == 100:
                            return 0
            # NOTE(review): reconstructed from collapsed source -- the local
            # JSON copy is written in online mode as well; confirm in VCS
            with open('output/json_examples_{0}.json'.format(pages_count),
                      'w') as o:
                o.write(formatted_output)
            pages_count += 1
            output = []
        if input_word[0] == '-' or input_word[-1] == '-' or input_word[0].isupper():
            # let's skip prefixes and sufixes for now, also whatever starts
            # with a capital leter
            continue
        query = '{0}**'.format(input_word).replace(' ', '** ')
        result = nkjp_lookup(query)
        root = etree.parse(result).getroot()
        if root.find('concordance') is None:
            continue
        defs = get_definitions(input_word)
        if defs == 0:
            continue
        new_word = ExampleDict()
        new_word['title'] = input_word
        new_word['fetch_time'] = str(defs[1])
        new_word['definitions'] = defs[0]
        for line in root.find('concordance').findall('line'):
            sentence = extract_one_sentence(line, input_word)
            # NKJP treats gerunds as verb forms. We don't.
            if '\'\'czasownik' in new_word['definitions'] and\
                    all(('ger:' in analysed[2] or 'subst:' in analysed[2])
                        for analysed in morfeusz.analyse(sentence[1])[0]):
                continue
            if check_sentence_quality(sentence) == 0:
                continue
            ref = get_reference(line, authors_hashtable)
            if ref == '':
                break
            if len(new_word['examples']) < 2:
                temp_example = {'verificator': 'None',
                                'correct_num': 'None',
                                'good_example': False,
                                'bad_example': False}
                temp_example['example'] = wikitext_one_sentence(sentence, input_word)
                temp_example['left_extra'] = phrases_wikilink(wikilink(sentence[3]))
                temp_example['right_extra'] = phrases_wikilink(wikilink(sentence[4]))
                temp_example['source'] = ref
                orphan_switch = check_if_includes_orphan(
                    sentence, orphans, edit_history['orphans'])
                temp_example['orphan'] = orphan_switch
                new_word['examples'].append(temp_example)
            else:
                # two examples already collected -- replace the weaker one,
                # preferring sentences that contain an orphaned word
                found_new = 0
                wikified_example = wikitext_one_sentence(sentence, input_word)
                for ex_ix, ex in enumerate(new_word['examples']):
                    neworphan = check_if_includes_orphan(
                        sentence, orphans, edit_history['orphans'])
                    if neworphan:
                        if ex['orphan']:
                            if wikified_proportion(ex['example']) < \
                                    wikified_proportion(wikified_example):
                                new_example = new_word['examples'][ex_ix]
                                found_new = 1
                                orphan_switch = neworphan
                                break
                        elif not orphan_switch:
                            new_example = new_word['examples'][ex_ix]
                            found_new = 1
                            break
                    else:
                        if not ex['orphan']:
                            if wikified_proportion(ex['example']) < \
                                    wikified_proportion(wikified_example):
                                new_example = new_word['examples'][ex_ix]
                                found_new = 1
                                break
                if found_new:
                    new_example['orphan'] = neworphan
                    # same value as wikitext_one_sentence(sentence, input_word)
                    new_example['example'] = wikified_example
                    new_example['left_extra'] = phrases_wikilink(wikilink(sentence[3]))
                    new_example['right_extra'] = phrases_wikilink(wikilink(sentence[4]))
                    new_example['source'] = ref
        if new_word and len(new_word['examples']) > 0:
            output.append(new_word)
            words_count += 1