# coding=utf-8
from collections import Counter
from plp import PLP

p = PLP()


def basic_form(word):
    """Map ``word`` to ``p.bform`` of the first id that ``p.rec`` returns.

    When ``p.rec(word)`` returns nothing, ``word`` is returned unchanged.
    """
    recognised = p.rec(word)
    if len(recognised) > 0:
        return p.bform(recognised[0])
    return word


def stats_sorted(stats):
    """Return the pairs in ``stats`` ordered by their second element, largest first."""
    def second_element(pair):
        return pair[1]

    return sorted(stats, key=second_element, reverse=True)


def ranking(words):
    """Count the basic forms of ``words``.

    Returns a 2-tuple: every ``(form, count)`` pair ordered by count
    descending, and the 100 most common pairs in the same ordering.
    """
    stats = Counter(basic_form(word) for word in words)
    every_pair = stats_sorted(stats.items())
    top_hundred = stats_sorted(stats.most_common(100))
    return every_pair, top_hundred
# NOTE(review): BETA is not referenced in the code shown here -- document its
# meaning next to the place where it is used.
BETA = 0.00002


def pre_process(s):
    """Return ``s`` lower-cased, with every character that is neither a word
    character nor whitespace removed (``re.UNICODE`` matching)."""
    # NOTE(review): the pattern is a non-raw string literal containing
    # backslash classes; a raw string is the conventional spelling -- confirm
    # and switch.
    return re.sub('[^\w\s]', '', s.lower(), flags=re.UNICODE)


def strip_sie(form):
    """Return ``form`` without a trailing ' się'; other input is returned as is."""
    if form.endswith(' się'):
        return form[:-len(' się')]
    return form


if __name__ == '__main__':
    p = PLP()
    # Read the stop-word list from its JSON file.
    with open('data/stop_words.json', 'r') as f:
        stop_list = json.load(f)
    # Start from empty values; the branch below replaces them with previously
    # saved results when all three result files exist.
    words_freq = {}
    total_no = 0
    cooccurence_freq = {}
    associative_strength = {}
    if isfile('results/words_freq.json') and isfile('results/total_no.json') and isfile(
            'results/' + STIMULUS + '_cooccurence_freq.json'):
        with open('results/words_freq.json', 'r') as f:
            words_freq = json.load(f)
        with open('results/total_no.json', 'r') as f:
            total_no = json.load(f)
        with open('results/' + STIMULUS + '_cooccurence_freq.json', 'r') as f:
import codecs from plp import PLP from stemmer import Stemmer __author__ = 'maciej' plp = PLP() plp._init() ile_poprawnych = 0 ile_wszystkich = 0 s = Stemmer(plp, filename='trie.bak', word_type=None) f = codecs.open('test.txt', 'r', 'utf-8') for line in f: ile_wszystkich += 1 parts = line.split(',') b_form = s.find_basic_form(parts[0]) if b_form.basic_form.strip() == parts[1].strip(): ile_poprawnych += 1 else: print b_form.basic_form, ';', parts[1], ';', parts[0] print 'Liczba poprawnie rozpoznanych: ', ile_poprawnych, '\nLiczba niepoprawnie rozpoznanych:', ile_wszystkich - ile_poprawnych
def getForms(bodziec):
    """Return ``PLP.forms`` for the first id that ``PLP.orec(bodziec)`` yields.

    A single ``PLP`` instance is created per call and only the first id is
    looked up; the result no longer depends on ``map`` returning a list.

    Raises:
        IndexError: if ``orec`` returns no ids for ``bodziec``.
    """
    plp = PLP()
    ids = plp.orec(bodziec)
    return plp.forms(ids[0])
# coding: utf-8 import codecs from collections import defaultdict from plp import PLP __author__ = "Michał Ciołczyk" _FILENAME = "data/odm.txt" _ENCODING = "windows-1250" _basic_forms = defaultdict(list) _initialized = False _plp = PLP() _SIE = ' się' def _load_flection_map(): global _initialized if not _initialized: with codecs.open(_FILENAME, 'r', encoding=_ENCODING) as f: for line in f: forms = line.rstrip('\n').split(', ') bform = forms[0] for form in forms: _basic_forms[form].append(bform) for form, bforms in _basic_forms.items(): _basic_forms[form] = list(set(bforms)) _initialized = True def _strip_sie(form):
#!/usr/bin/env python # encoding: utf-8 from plp import PLP p = PLP() VERB = PLP.CZESCI_MOWY.CZASOWNIK stimulus = u'fajka' st_forms = set(p.forms(p.rec(u'fajka')[0])) print st_forms snippets_count = 0 def parse_file(filename): global snippets_count with open(filename, 'r') as f: all_words = [] for line in f: words = line.strip().split() all_words.extend(words) stimulus_seen = False last_verb = None second_to_last_verb = None last_verb_index = 0 for i, word in enumerate(all_words): word_utf8 = word.decode('utf-8') if word_utf8 in st_forms or word_utf8[:-1] in st_forms: #print 'stimulus_seen'
def __init__(self): self.plp = PLP() self.plp._init() print 'Initialized plp' self.cities = TestPreparer().start()
class Test: """ Class responsible for running test against cities retrieved by TestPreparer """ trie_files = ['trie.bak', 'trie_only_nouns.bak', 'trie_nouns_and_adjectives.bak', 'trie_nouns_and_numerals.bak', 'trie_nouns_adjectives_and_numerals.bak'] # trie_files = ['trie.bak'] def __init__(self): self.plp = PLP() self.plp._init() print 'Initialized plp' self.cities = TestPreparer().start() # print 'Loaded cities: ', self.cities.__len__() def test(self): print 'Starting analysis' for trie_name in self.trie_files: print 'Starting', trie_name correct_number = 0 all_number = 0 s = Stemmer(self.plp, filename=trie_name, word_type=None) corrects_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/success_' + trie_name.replace('bak', 'txt'), 'w', 'utf-8') result_file = codecs.open('../wyniki/single_name/wies_miasto_kolonia_osada/' + trie_name.replace('bak', 'txt'), 'w', 'utf-8') result_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n') corrects_file.write(u'Dopełniacz;Mianownik;Wynik Stemmera\n') # for k, v in self.cities.iteritems(): cities = codecs.open('../data/cities_wies_miasto_kolonia_osada.csv', 'r', 'utf-8') for city in cities: k = city.split(';')[1].strip() v = city.split(';')[0].strip() all_number += 1 basic_form = '' # word_labels = [] # if k.__contains__('-'): # for city_parts in v.split('-'): # b = s.find_basic_form(city_parts) # basic_form += b.basic_form + '-' # word_labels.append(b.word_labels) # basic_form = basic_form[0:basic_form.__len__() - 1] # else: # for city_parts in v.split(' '): # b = s.find_basic_form(city_parts) # basic_form += b.basic_form + ' ' # word_labels.append(b.word_labels) basic_form = s.find_basic_form(v).basic_form.strip() if basic_form != k: # if basic_form == k: result_file.write(v + ';' + k + ';' + basic_form + ';') # for w_label in word_labels: # result_file.write(self.find_most_label(w_label) + ' ') result_file.write('\n') else: # corrects_file.write(v + ';' + k + ';' + basic_form + ';') # for label in 
s.find_labels(word_labels): # corrects_file.write(label + ' ') # corrects_file.write('\n') correct_number += 1 result_file.write(u'Liczba miejscowości;Liczba niepoprawnie rozpoznanych;Liczba poprawnie rozpoznanych\n') result_file.write( str(all_number) + ';' + str(all_number - correct_number) + ';' + str(correct_number)) print 'Done', trie_name def find_most_label(self, w_label): max_labels = dict() for word in w_label: for id in self.plp.rec(word): label = self.plp.label(id) if label in max_labels: max_labels[label] += 1 else: max_labels[label] = 1 return max(max_labels.iteritems(), key=operator.itemgetter(1))[0] def prepare_cities(self): print 'Preparing cities' res_file = codecs.open('../data/cities.csv', 'w', 'utf-8') res_file.write(u'Dopełniacz;Mianownik\n') for k, v in self.cities.iteritems(): res_file.write(v + ';' + k + '\n')
def setUp(self):
    """Build the PLP instance used by the tests from a fixed library path."""
    library_path = '/usr/local/clp/lib/libclp_2.6.so'
    self.plp = PLP(library_path)
class PLPTestCase(unittest.TestCase):
    """Checks of the PLP wrapper's methods against fixed expected values."""

    def setUp(self):
        """Build the PLP instance under test from a fixed library path."""
        library_path = '/usr/local/clp/lib/libclp_2.6.so'
        self.plp = PLP(library_path)

    def test_ver(self):
        """``ver`` returns a unicode string."""
        version = self.plp.ver()
        self.assertIsInstance(version, unicode)

    def test_rec(self):
        """``rec`` returns the expected id list for a word with diacritics."""
        found = self.plp.rec(u'żółwiem')
        self.assertEqual(found, [18660912])

    def test_orec(self):
        """``rec`` finds nothing for the ASCII spelling, ``orec`` finds the id."""
        strict = self.plp.rec(u'zolwiem')
        self.assertEqual(strict, [])
        relaxed = self.plp.orec(u'zolwiem')
        self.assertEqual(relaxed, [18660912])

    def test_bform(self):
        """``bform`` returns the expected base form for the id."""
        base = self.plp.bform(18660912)
        self.assertEqual(base, u'żółw')

    def test_label(self):
        """The first element of ``label`` is the expected part-of-speech constant."""
        noun_label = self.plp.label(18660912)
        self.assertEqual(noun_label[0], PLP.CZESCI_MOWY.RZECZOWNIK)
        verb_id = self.plp.rec(u'idę')[0]
        verb_label = self.plp.label(verb_id)
        self.assertEqual(verb_label[0], PLP.CZESCI_MOWY.CZASOWNIK)

    def test_ogonkify(self):
        """``ogonkify`` yields the expected variants, in any order."""
        expected = [u'gzó', u'gżo', u'gźo', u'gźó', u'gżó']
        variants = self.plp.ogonkify(u'gzo')
        self.assertItemsEqual(variants, expected)

    def test_forms(self):
        """``forms`` returns the expected list of forms for the id."""
        expected = [
            u'pogoda', u'pogody', u'pogodzie', u'pogodę', u'pogodą',
            u'pogodo', u'pogód', u'pogodom', u'pogodami', u'pogodach'
        ]
        actual = self.plp.forms(17786048)
        self.assertEqual(actual, expected)

    def test_vec(self):
        """The first element of ``vec`` for the given id and form is 5."""
        positions = self.plp.vec(18660912, u'żółwiem')
        self.assertEqual(positions[0], 5)