Code Example #1
import re

# Note: wikilink(), shortLink() and phrases_wikilink() are project-local
# helpers defined elsewhere in this module.
def wikitext_one_sentence(left_match_right, match_base_form):
    """
    Take a tuple with the left and right side of the matched word
    and format it for printing. This is a way to circumvent doing
    wikilink('[[word]]'), which doesn't work properly as of 01/2015

    Args:
        left_match_right (tuple): a tuple of three strings: the left side
            of the NKJP match, the match itself (in [[baseform|match]] form)
            and the right side
        match_base_form (str): base form of the queried word, for wikisation

    Returns:
        str: [[the|The]] [[input]] [[sentence]] [[format]]ted [[like]] [[this]].
    """

    re_whitespace_left = re.compile(r'(\s*?)$')
    re_whitespace_right = re.compile(r'^(\s*)')

    # https://regex101.com/r/yB6tQ8/6
    re_punctuation_around = re.compile(r'^([\W]*?)(.+?)([\W]*?)$')

    whitespaces_left = re.search(re_whitespace_left, left_match_right[0])
    whitespaces_right = re.search(re_whitespace_right, left_match_right[2])
    punctuation_match = re.search(re_punctuation_around, left_match_right[1])

    pretty_sentence = wikilink(left_match_right[0])

    if whitespaces_left:
        pretty_sentence += whitespaces_left.group(1)

    if punctuation_match:
        pretty_sentence += punctuation_match.group(1)
        pretty_sentence += shortLink(match_base_form, punctuation_match.group(2))
        pretty_sentence += punctuation_match.group(3)
    else:
        pretty_sentence += left_match_right[1]

    if whitespaces_right:
        pretty_sentence += whitespaces_right.group(1)
    pretty_sentence += wikilink(left_match_right[2])
    prettier_sentence = phrases_wikilink(pretty_sentence)

    return prettier_sentence
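
A minimal usage sketch of the function above. wikilink(), shortLink() and phrases_wikilink() are project-local helpers that do not appear on this page, so the stubs below are assumptions that only approximate the behaviour implied by the docstring; the sample input tuple and the expected output line are likewise illustrative.

import re

def wikilink(text):
    # hypothetical stub: wrap each whitespace-separated token in [[...]]
    return ' '.join('[[{0}]]'.format(word) for word in text.split())

def shortLink(base_form, match):
    # hypothetical stub: build a piped link [[base form|inflected form]]
    return '[[{0}|{1}]]'.format(base_form, match)

def phrases_wikilink(sentence):
    # hypothetical stub: pass the sentence through unchanged
    return sentence

# (left context, matched word with trailing punctuation, right context)
print(wikitext_one_sentence(('Ala ma ', 'kota,', ' naprawdę.'), 'kot'))
# -> [[Ala]] [[ma]] [[kot|kota]], [[naprawdę.]]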
Code Example #2
# Relies on re, random, json, pywikibot (imported as pwb), an
# ElementTree-compatible etree module, the morfeusz analyser, and a number of
# project-local helpers that are imported elsewhere in this module.
def orphaned_examples(test_word=None,
                      hashtable=None,
                      online=False,
                      complete_overwrite=False,
                      onepage_testmode=False):

    buffer_size = 20  # how many words will be printed on one page
    if online:
        # prepare only as many pages as we need at the moment
        active_words = fetch_active_words()
    else:
        active_words = {'active': [], 'inactive': [], 'under_review': []}

    edit_history = read_edit_history()
    excluded_words = active_words['active'] + edit_history['added']

    with open('output/empty_sections.txt', 'r') as g:
        empty_sections = g.readlines()
        random.shuffle(empty_sections)

    if not complete_overwrite:
        excluded_words += active_words['inactive']
    else:
        excluded_words += active_words['under_review']

    if not hashtable:
        authors_hashtable = read_author_hashtable()
    else:
        authors_hashtable = hashtable

    site = pwb.Site()

    # this is a dirty trick, because morfAnalyse() and wikilink() don't
    # really work as they should. The following regex extracts the first part
    # of [[these|links]]
    re_base_form = re.compile(r'\[\[(.*?)(?:\||\]\])')

    words_count = 0
    with open('input/orphans.txt') as f, \
         open('output/empty_sections.txt', 'r') as g:

        # list of pages with no examples (obtained by empty_section.py)
        orphans = f.read()

        # for testing purposes
        if test_word:
            empty_sections = [test_word]

        pages_count = 666 if onepage_testmode else 0  # loop helper; 666 is a sentinel for single-page test mode
        output = []  # list container for examples

        for input_word in empty_sections:

            if not complete_overwrite and words_count > 2 * len(active_words['active']):
                with open('output/example_queue.json', 'w') as o:
                    formatted_output = json.dumps(ordermydict(output),
                                                  ensure_ascii=False,
                                                  indent=4)
                    o.write(formatted_output)
                return 2

            if pages_count == 101 or (pages_count == 667 and onepage_testmode):
                return 0

            # dealing with various list formats, e.g. *[[word]]
            input_word = input_word.strip('*[]\n')
            if len(input_word) < 4 or input_word.upper() == input_word:
                continue

            if input_word in excluded_words:
                continue

            print(input_word)

            if complete_overwrite:
                # write to file/page every N words
                if len(output) == buffer_size:
                    formatted_output = json.dumps(ordermydict(output),
                                                  ensure_ascii=False,
                                                  indent=4)

                    if online:
                        while True:
                            output_page = pwb.Page(
                                site,
                                'Wikisłownik:Dodawanie przykładów/dane/{0:03d}'.format(pages_count - 1))
                            if pages_count == 666 or output_page.userName() == 'AlkamidBot':
                                output_page.text = formatted_output
                                output_page.save(comment='Pobranie nowych przykładów z NKJP.pl')
                                break
                            else:
                                pages_count += 1
                                if pages_count == 100:
                                    return 0

                    with open('output/json_examples_{0}.json'.format(pages_count), 'w') as o:
                        o.write(formatted_output)
                        pages_count += 1
                        output = []

            if input_word[0] == '-' or input_word[-1] == '-' or input_word[0].isupper():
                continue  # skip prefixes and suffixes for now, as well as anything that starts with a capital letter

            query = '{0}**'.format(input_word).replace(' ', '** ')
            result = nkjp_lookup(query)
            root = etree.parse(result).getroot()

            #print(xml.dom.minidom.parseString(etree.tostring(root)).toprettyxml())
            #return -1
            if root.find('concordance') is not None:
                found = 0
                found_orphan = 0

                defs = get_definitions(input_word)
                if defs == 0:
                    continue

                new_word = ExampleDict()
                new_word['title'] = input_word
                new_word['fetch_time'] = str(defs[1])
                new_word['definitions'] = defs[0]

                for line in root.find('concordance').findall('line'):

                    sentence = extract_one_sentence(line, input_word)

                    # NKJP treats gerunds as verb forms. We don't
                    if '\'\'czasownik' in new_word['definitions'] and \
                       all(('ger:' in analysed[2] or 'subst:' in analysed[2])
                           for analysed in morfeusz.analyse(sentence[1])[0]):
                        continue

                    if check_sentence_quality(sentence) == 0:
                        continue

                    ref = get_reference(line, authors_hashtable)
                    if ref == '':
                        break

                    if len(new_word['examples']) < 2:
                        temp_example = {
                            'verificator': 'None',
                            'correct_num': 'None',
                            'good_example': False,
                            'bad_example': False
                        }
                        #temp_example['left'] = line.find('left').text
                        #temp_example['right'] = line.find('right').text
                        temp_example['example'] = wikitext_one_sentence(
                            sentence, input_word)
                        temp_example['left_extra'] = phrases_wikilink(
                            wikilink(sentence[3]))
                        temp_example['right_extra'] = phrases_wikilink(
                            wikilink(sentence[4]))
                        temp_example['source'] = ref

                        orphan_switch = check_if_includes_orphan(
                            sentence, orphans, edit_history['orphans'])
                        temp_example['orphan'] = orphan_switch
                        new_word['examples'].append(temp_example)

                    else:

                        found_new = 0
                        wikified_example = wikitext_one_sentence(
                            sentence, input_word)

                        for ex_ix, ex in enumerate(new_word['examples']):
                            neworphan = check_if_includes_orphan(
                                sentence, orphans, edit_history['orphans'])
                            if neworphan:
                                if ex['orphan']:
                                    if wikified_proportion(ex['example']) < wikified_proportion(wikified_example):
                                        new_example = new_word['examples'][ex_ix]
                                        found_new = 1
                                        orphan_switch = neworphan
                                        break
                                elif not orphan_switch:
                                    new_example = new_word['examples'][ex_ix]
                                    found_new = 1
                                    break
                            else:
                                if not ex['orphan']:
                                    if wikified_proportion(ex['example']) < wikified_proportion(wikified_example):
                                        new_example = new_word['examples'][ex_ix]
                                        found_new = 1
                                        break

                        if found_new:
                            new_example['orphan'] = neworphan
                            #new_example['left'] = line.find('left').text
                            #new_example['right'] = line.find('right').text
                            new_example['example'] = wikitext_one_sentence(
                                sentence, input_word)
                            new_example['left_extra'] = phrases_wikilink(
                                wikilink(sentence[3]))
                            new_example['right_extra'] = phrases_wikilink(
                                wikilink(sentence[4]))
                            new_example['source'] = ref

                if new_word and len(new_word['examples']) > 0:
                    output.append(new_word)
                    words_count += 1
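
Two of the smaller building blocks above can be checked in isolation with the standard library alone. The first snippet exercises the base-form regex, which captures the target of a [[link]] or [[baseform|form]] before the pipe or the closing brackets; the second shows how the NKJP query string is built by suffixing every word with a '**' wildcard. The sample strings are illustrative.

import re

# the base-form regex defined near the top of orphaned_examples()
re_base_form = re.compile(r'\[\[(.*?)(?:\||\]\])')
print(re_base_form.findall('[[kot|kota]] i [[pies]]'))  # -> ['kot', 'pies']

# the NKJP query construction used before nkjp_lookup()
input_word = 'dwa słowa'
query = '{0}**'.format(input_word).replace(' ', '** ')
print(query)  # -> dwa** słowa**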