Example #1
def get_words(txt):
    # Tokenize, lemmatize with Morfeusz and return the 5 most frequent lemmas
    words = nltk.word_tokenize(txt)
    words_1 = []
    for w in words:
        lemma = analyse(w)[0][0][1]  # lemma of the first interpretation, or None
        words_1.append(lemma if lemma is not None else w)
    words_1 = [w for w in words_1 if len(w) > 2]  # drop very short tokens
    fdist = FreqDist(words_1)
    most = fdist.most_common(5)
    most = [m[0] for m in most]
    return most
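A minimal usage sketch, assuming `analyse` comes from the `morfeusz` module and that NLTK's `punkt` tokenizer data is installed (none of these imports appear in the excerpt above):

import nltk
from nltk import FreqDist
from morfeusz import analyse  # assumed import location for the analyser

print(get_words('Ala ma kota, a kot ma Alę.'))  # prints up to five frequent lemmas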
Example #2
def test1(self):
    text = 'Mama ma.'
    # analyse() returns one list of (form, lemma, tag) tuples per interpretation
    interps = morfeusz.analyse(text)
    if sgjp:
        # the SGJP dictionary adds an extra adjectival (vocative) reading of "ma"
        self.assertEqual(interps.pop(),
            [(u('Mama'), u('mama'), 'subst:sg:nom:f'), (u('ma'), u('mój'), 'adj:sg:voc:f:pos'), (u('.'), u('.'), 'interp')]
        )
    self.assertEqual(interps, [
        [(u('Mama'), u('mama'), 'subst:sg:nom:f'), (u('ma'), u('mieć'), 'fin:sg:ter:imperf'), (u('.'), u('.'), 'interp')],
        [(u('Mama'), u('mama'), 'subst:sg:nom:f'), (u('ma'), u('mój'), 'adj:sg:nom:f:pos'), (u('.'), u('.'), 'interp')]
    ])
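The u() helper used by these tests is defined outside the excerpt; a plausible Python 2/3 compatibility shim (an assumption, not the project's actual code) would be:

import sys

def u(text):
    # Python 3 strings are already unicode; on Python 2 decode the UTF-8 literal
    if sys.version_info[0] >= 3:
        return text
    return text.decode('utf-8')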
Example #3
def test2(self):
    text = u('Miałem miał.')
    # dag=True returns graph edges: (start_node, end_node, (form, lemma, tag))
    interps = morfeusz.analyse(text, dag=True)
    self.assertEqual(interps, [
        (0, 1, (u('Miał'), u('mieć'), u('praet:sg:m1:imperf'))),
        (0, 1, (u('Miał'), u('mieć'), u('praet:sg:m2:imperf'))),
        (0, 1, (u('Miał'), u('mieć'), u('praet:sg:m3:imperf'))),
        (1, 2, (u('em'), u('być'), u('aglt:sg:pri:imperf:wok'))),
        (0, 2, (u('Miałem'), u('miał'), u('subst:sg:inst:m3'))),
        (2, 3, (u('miał'), u('miał'), u('subst:sg:nom:m3'))),
        (2, 3, (u('miał'), u('miał'), u('subst:sg:acc:m3'))),
        (2, 3, (u('miał'), u('mieć'), u('praet:sg:m1:imperf'))),
        (2, 3, (u('miał'), u('mieć'), u('praet:sg:m2:imperf'))),
        (2, 3, (u('miał'), u('mieć'), u('praet:sg:m3:imperf'))),
        (3, 4, (u('.'), u('.'), u('interp'))),
    ])
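With dag=True every interpretation is an edge (start_node, end_node, (form, lemma, tag)) in a token graph, so ambiguous segmentations such as Miał+em vs. Miałem show up as overlapping spans. An illustrative helper (the name is mine, not part of the test suite) that groups edges by span:

from collections import defaultdict

def group_dag(edges):
    # Collect all (form, lemma, tag) readings for each (start, end) span
    segments = defaultdict(list)
    for start, end, interpretation in edges:
        segments[(start, end)].append(interpretation)
    return dict(segments)

# group_dag(morfeusz.analyse(u('Miałem miał.'), dag=True)) would yield the keys
# (0, 1), (1, 2), (0, 2), (2, 3) and (3, 4) for the edges listed above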
Example #4
def get_word_counts(text):
    """Count how many times each lemma occurs in the given Polish text."""
    words = defaultdict(int)
    tokens = nltk.wordpunct_tokenize(text)
    for word in tokens:  # for every token in the text -> lemmatize & count
        word = re.sub(r'[_+=:;"\'\?/>.<,\\]', ' ', word)  # strip punctuation characters
        if len(word) > 1:
            # morphological analyser for Polish
            res = morfeusz.analyse(word, expand_tags=False, dag=True)
            try:
                base = res[0][2][1]  # lemma of the first DAG edge
            except IndexError:
                base = None
            if base is not None:
                words[base] += 1  # increment the lemma count
    return words
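A short driver for the function above, with the imports the excerpt relies on spelled out (the sample sentence is arbitrary):

import re
from collections import defaultdict

import nltk
import morfeusz  # Polish morphological analyser assumed by the excerpt

counts = get_word_counts('Ala ma kota. Kot ma Alę.')
for lemma, count in sorted(counts.items(), key=lambda kv: kv[1], reverse=True):
    print(lemma, count)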
Example #5
def single_word_lemma(word):
    # analyzing sometimes fails
    try:
        interpretations = morfeusz.analyse(word)
    except Exception:
        print('Parsing failed:', word, sys.exc_info(), file=sys.stderr)
        return word

    lemmas = {i[0][1] for i in interpretations}  # lemma of each interpretation's first segment

    if len(interpretations) == 1 and interpretations[0][0][2] == 'ign':
        # OOD - out of dictionary: maybe we can find this word in a different database
        # cases:
        # [inflected] acronyms
        # inflected names like Murphy'iego
        apos_agluts = "'a 'ego 'em 'er 'o 's 'u".split()
        lemma = interpretations[0][0][1]
        for ending in apos_agluts:
            if ending in lemma:
                return lemma.replace(ending, '')

    if word in lemmas:
        # print('Leaving:', word)
        return word

    if len(lemmas) == 1:
        lemma = list(lemmas)[0]
        # print('Single candidate:', lemma)
        splitted = lemma.split(':')  # Morfeusz lemmas may carry homonym markers, like Polska:s2
        return splitted[0].capitalize() if word[0].isupper() else splitted[0]

    # next step: strip the markers from lemmas and keep only those whose capitalization matches the word
    matching_lemmas = set(
        lemma.split(':')[0] for lemma in lemmas
        if lemma[0].isupper() == word[0].isupper())
    if len(matching_lemmas) == 1:
        return list(matching_lemmas)[0]

    # fallback
    return word
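Because the function falls back to returning the word itself whenever analysis fails, is ambiguous, or the token is out of dictionary, it can be mapped safely over arbitrary tokens. A small illustrative driver (assuming `sys` and `morfeusz` are imported as the excerpt requires):

import sys
import morfeusz  # assumed import; the excerpt calls morfeusz.analyse()

for token in ['kotem', "Murphy'ego", 'xyzzy']:
    print(token, '->', single_word_lemma(token))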
Example #6
def orphaned_examples(test_word=None,
                      hashtable=None,
                      online=False,
                      complete_overwrite=False,
                      onepage_testmode=False):

    buffer_size = 20  #how many words will be printed on one page
    if online:
        # prepare only as many pages as we need at the moment
        active_words = fetch_active_words()
    else:
        active_words = {'active': [], 'inactive': [], 'under_review': []}

    edit_history = read_edit_history()
    excluded_words = active_words['active'] + edit_history['added']

    with open('output/empty_sections.txt', 'r') as g:
        empty_sections = g.readlines()
        random.shuffle(empty_sections)

    if not complete_overwrite:
        excluded_words += active_words['inactive']
    else:
        excluded_words += active_words['under_review']

    if not hashtable:
        authors_hashtable = read_author_hashtable()
    else:
        authors_hashtable = hashtable

    site = pwb.Site()

    # this is a dirty trick, because morfAnalyse() and wikilink() don't
    # really work as they should. The following regex extracts the first part
    # of [[these|links]]
    re_base_form = re.compile(r'\[\[(.*?)(?:\||\]\])')

    words_count = 0
    with open('input/orphans.txt') as f:

        # list of pages with no examples (obtained by empty_section.py)
        orphans = f.read()

        # for testing purposes
        if test_word:
            empty_sections = [test_word]

        pages_count = 666 if onepage_testmode else 0  #loop helper
        output = []  #list-container for examples

        for input_word in empty_sections:

            if not complete_overwrite and words_count > 2 * len(
                    active_words['active']):
                with open('output/example_queue.json', 'w') as o:
                    formatted_output = json.dumps(ordermydict(output),
                                                  ensure_ascii=False,
                                                  indent=4)
                    o.write(formatted_output)
                return 2

            if (pages_count == 101) or (pages_count == 667
                                        and onepage_testmode):
                return 0

            # dealing with various list formats, e.g. *[[word]]
            input_word = input_word.strip('*[]\n')
            if len(input_word) < 4 or input_word.upper() == input_word:  # skip very short and all-uppercase entries
                continue

            if input_word in excluded_words:
                continue

            print(input_word)

            if complete_overwrite:
                # write to file/page every N words
                if len(output) == buffer_size:
                    formatted_output = json.dumps(ordermydict(output),
                                                  ensure_ascii=False,
                                                  indent=4)

                    if online:
                        while True:
                            output_page = pwb.Page(
                                site,
                                'Wikisłownik:Dodawanie przykładów/dane/{0:03d}'
                                .format(pages_count - 1))
                            if (pages_count == 666
                                    or output_page.userName() == 'AlkamidBot'):
                                output_page.text = formatted_output
                                output_page.save(
                                    comment=
                                    'Pobranie nowych przykładów z NKJP.pl')
                                break
                            else:
                                pages_count += 1
                                if pages_count == 100:
                                    return 0

                    with open(
                            'output/json_examples_{0}.json'.format(
                                pages_count), 'w') as o:
                        o.write(formatted_output)
                        pages_count += 1
                        output = []

            if input_word[0] == '-' or input_word[-1] == '-' or input_word[
                    0].isupper():
                continue  # let's skip prefixes and suffixes for now, also whatever starts with a capital letter

            query = '{0}**'.format(input_word).replace(' ', '** ')
            result = nkjp_lookup(query)
            root = etree.parse(result).getroot()

            #print(xml.dom.minidom.parseString(etree.tostring(root)).toprettyxml())
            #return -1
            if root.find('concordance') is not None:
                found = 0
                found_orphan = 0

                defs = get_definitions(input_word)
                if defs == 0:
                    continue

                new_word = ExampleDict()
                new_word['title'] = input_word
                new_word['fetch_time'] = str(defs[1])
                new_word['definitions'] = defs[0]

                for line in root.find('concordance').findall('line'):

                    sentence = extract_one_sentence(line, input_word)

                    # NKJP treats gerunds as verb forms. We don't
                    if '\'\'czasownik' in new_word['definitions'] and\
                       all(('ger:' in analysed[2] or 'subst:' in analysed[2]) for analysed in morfeusz.analyse(sentence[1])[0]):
                        continue

                    if check_sentence_quality(sentence) == 0:
                        continue

                    ref = get_reference(line, authors_hashtable)
                    if ref == '':
                        break

                    if len(new_word['examples']) < 2:
                        temp_example = {
                            'verificator': 'None',
                            'correct_num': 'None',
                            'good_example': False,
                            'bad_example': False
                        }
                        #temp_example['left'] = line.find('left').text
                        #temp_example['right'] = line.find('right').text
                        temp_example['example'] = wikitext_one_sentence(
                            sentence, input_word)
                        temp_example['left_extra'] = phrases_wikilink(
                            wikilink(sentence[3]))
                        temp_example['right_extra'] = phrases_wikilink(
                            wikilink(sentence[4]))
                        temp_example['source'] = ref

                        orphan_switch = check_if_includes_orphan(
                            sentence, orphans, edit_history['orphans'])
                        temp_example['orphan'] = orphan_switch
                        new_word['examples'].append(temp_example)

                    else:

                        found_new = 0
                        wikified_example = wikitext_one_sentence(
                            sentence, input_word)

                        for ex_ix, ex in enumerate(new_word['examples']):
                            neworphan = check_if_includes_orphan(
                                sentence, orphans, edit_history['orphans'])
                            if neworphan:
                                if ex['orphan']:
                                    if wikified_proportion(
                                            ex['example']
                                    ) < wikified_proportion(wikified_example):
                                        new_example = new_word['examples'][
                                            ex_ix]
                                        found_new = 1
                                        orphan_switch = neworphan
                                        break
                                elif not orphan_switch:
                                    new_example = new_word['examples'][ex_ix]
                                    found_new = 1
                                    break
                            else:
                                if not ex['orphan']:
                                    if wikified_proportion(
                                            ex['example']
                                    ) < wikified_proportion(wikified_example):
                                        new_example = new_word['examples'][
                                            ex_ix]
                                        found_new = 1
                                        break

                        if found_new:
                            new_example['orphan'] = neworphan
                            #new_example['left'] = line.find('left').text
                            #new_example['right'] = line.find('right').text
                            new_example['example'] = wikitext_one_sentence(
                                sentence, input_word)
                            new_example['left_extra'] = phrases_wikilink(
                                wikilink(sentence[3]))
                            new_example['right_extra'] = phrases_wikilink(
                                wikilink(sentence[4]))
                            new_example['source'] = ref

                if new_word and len(new_word['examples']) > 0:
                    output.append(new_word)
                    words_count += 1
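The re_base_form pattern defined near the top of the function pulls the target page name out of a wikilink, whether or not the link is piped; a standalone check of just that regex:

import re

re_base_form = re.compile(r'\[\[(.*?)(?:\||\]\])')
print(re_base_form.findall('Zobacz [[kot|kota]] i [[pies]].'))  # -> ['kot', 'pies']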
Example #7
    # else every exception is "during handling of OSError..."
    restart = True

if restart:
    # keyboard interrupts jump out here if raised in wrapper
    env['LD_LIBRARY_PATH'] = os.path.realpath(os.path.dirname(__file__))
    print('Restarting with LD_LIBRARY_PATH =', env['LD_LIBRARY_PATH'])
    subprocess.call([sys.executable] + sys.argv, env=env)
    exit()

parser = argparse.ArgumentParser()
parser.add_argument('--dag', action='store_true', help='use dag=True with morfeusz.analyse()')
args = parser.parse_args()

while True:
    try:
        data = input('morfeusz> ')
    except (EOFError, KeyboardInterrupt):
        print()
        break

    if not data:
        break

    pprint.pprint(morfeusz.analyse(data, dag=args.dag))

print('bye')