def main():
    """Parse the requested Wiktionary editions and write out their pairs.

    Command line, read from ``argv``:
        argv[1]   configuration file name, passed on to ``Wiktionary``
        argv[2:]  the wikicodes to process, or the literal ``all`` followed
                  by the path of a file holding one wikicode per line

    For every wikicode a ``Wiktionary`` object is built, all of its articles
    are parsed and ``write_pairs()`` is called.  An exception raised while
    handling one wikicode is reported and the loop carries on with the next.
    """
    # Check the length before indexing: with no wikicode argument at all the
    # original order raised IndexError on argv[2]; now the slice below simply
    # yields an empty list.
    if len(argv) > 3 and argv[2] == 'all':
        # The context manager closes the list file as soon as it is read.
        with open(argv[3]) as wc_f:
            wikicodes = [wc.strip() for wc in wc_f]
    else:
        wikicodes = [wc.strip() for wc in argv[2:]]
    # Blank lines (e.g. a trailing newline in the list file) are not codes.
    wikicodes = [wc for wc in wikicodes if wc]

    cfg_fn = argv[1]
    logger = logging.getLogger('wikt2dict')

    # NOTE: print is called with a single parenthesised argument throughout,
    # which produces the same output as the Python 2 print statement and is
    # also valid Python 3.
    for wc in wikicodes:
        # Bound before the try block so the NotImplementedError handler can
        # tell whether the object was constructed before the error occurred.
        wiktionary = None
        try:
            print('Parsing ' + wc + 'wiktionary')
            wiktionary = Wiktionary(wc, cfg_fn)
            logger.info('%s Wiktionary object built',
                        wiktionary.cfg['fullname'])
            wiktionary.parse_all_articles()
            logger.info('%s Wiktionary articles parsed',
                        wiktionary.cfg['fullname'])
            print(' Extracted {0} pairs'.format(
                len(wiktionary.article_parser.pairs)))
            wiktionary.write_pairs()
            logger.info('%s Wiktionary translations written to file',
                        wiktionary.cfg['fullname'])
        except NotImplementedError:
            # Prefer the configured full name; fall back to the bare
            # wikicode when the object or its configuration is unavailable.
            fullname = wc
            if wiktionary and wiktionary.cfg and wiktionary.cfg['fullname']:
                fullname = wiktionary.cfg['fullname']
            logger.error('%s Wiktionary unrecognized parser type', fullname)
        except AttributeError as e:
            print(e)
        except Exception as e:
            # Top-level boundary: report the failure and keep going.
            print(wc + ' ' + str(e))
def main():
    """Parse the requested Wiktionary editions and write out their pairs.

    Command line, read from ``argv``:
        argv[1]   configuration file name, passed on to ``Wiktionary``
        argv[2:]  the wikicodes to process, or the literal ``all`` followed
                  by the path of a file holding one wikicode per line

    Each wikicode is printed, then a ``Wiktionary`` object is built, its
    articles are parsed and ``write_pairs()`` is called.  An exception raised
    for one wikicode is reported and the loop carries on with the next.
    """
    # Check the length before indexing: with no wikicode argument at all the
    # original order raised IndexError on argv[2]; now the slice below simply
    # yields an empty list.
    if len(argv) > 3 and argv[2] == 'all':
        # The context manager closes the list file as soon as it is read.
        with open(argv[3]) as wc_f:
            wikicodes = [wc.strip() for wc in wc_f]
    else:
        wikicodes = [wc.strip() for wc in argv[2:]]
    # Blank lines (e.g. a trailing newline in the list file) are not codes.
    wikicodes = [wc for wc in wikicodes if wc]

    cfg_fn = argv[1]
    logger = logging.getLogger('wikt2dict')

    for wc in wikicodes:
        # Single-argument print(...) matches the Python 2 print statement
        # and is also valid Python 3.
        print(wc)
        try:
            wiktionary = Wiktionary(wc, cfg_fn)
            logger.info('%s Wiktionary object built',
                        wiktionary.cfg['fullname'])
            wiktionary.parse_all_articles()
            logger.info('%s Wiktionary articles parsed',
                        wiktionary.cfg['fullname'])
            wiktionary.write_pairs()
            logger.info('%s Wiktionary translations written to file',
                        wiktionary.cfg['fullname'])
        except AttributeError as e:
            # Still skipped, as before, but no longer silently: leave a
            # trace of which edition was dropped and why.
            logger.warning('%s Wiktionary skipped: %s', wc, e)
        except Exception as e:
            # Top-level boundary: report the failure and keep going.
            print(wc + ' ' + str(e))
def main():
    """Parse the requested Wiktionary editions and write out their pairs.

    Command line, read from ``argv``:
        argv[1]   configuration file name, passed on to ``Wiktionary``
        argv[2:]  the wikicodes to process, or the literal ``all`` followed
                  by the path of a file holding one wikicode per line

    For every wikicode a ``Wiktionary`` object is built, all of its articles
    are parsed and ``write_pairs()`` is called.  An exception raised while
    handling one wikicode is reported and the loop carries on with the next.
    """
    # Check the length before indexing: with no wikicode argument at all the
    # original order raised IndexError on argv[2]; now the slice below simply
    # yields an empty list.
    if len(argv) > 3 and argv[2] == 'all':
        # The context manager closes the list file as soon as it is read.
        with open(argv[3]) as wc_f:
            wikicodes = [wc.strip() for wc in wc_f]
    else:
        wikicodes = [wc.strip() for wc in argv[2:]]
    # Blank lines (e.g. a trailing newline in the list file) are not codes.
    wikicodes = [wc for wc in wikicodes if wc]

    cfg_fn = argv[1]
    logger = logging.getLogger('wikt2dict')

    # NOTE: print is called with a single parenthesised argument throughout,
    # which produces the same output as the Python 2 print statement and is
    # also valid Python 3.
    for wc in wikicodes:
        # Bound before the try block so the NotImplementedError handler can
        # tell whether the object was constructed before the error occurred.
        wiktionary = None
        try:
            print('Parsing ' + wc + 'wiktionary')
            wiktionary = Wiktionary(wc, cfg_fn)
            logger.info('%s Wiktionary object built',
                        wiktionary.cfg['fullname'])
            wiktionary.parse_all_articles()
            logger.info('%s Wiktionary articles parsed',
                        wiktionary.cfg['fullname'])
            print(' Extracted {0} pairs'.format(
                len(wiktionary.article_parser.pairs)))
            wiktionary.write_pairs()
            logger.info('%s Wiktionary translations written to file',
                        wiktionary.cfg['fullname'])
        except NotImplementedError:
            # Prefer the configured full name; fall back to the bare
            # wikicode when the object or its configuration is unavailable.
            fullname = wc
            if wiktionary and wiktionary.cfg and wiktionary.cfg['fullname']:
                fullname = wiktionary.cfg['fullname']
            logger.error('%s Wiktionary unrecognized parser type', fullname)
        except AttributeError as e:
            print(e)
        except Exception as e:
            # Top-level boundary: report the failure and keep going.
            print(wc + ' ' + str(e))
import pytest from wiktionary import Wiktionary, Declensions wiktionary = Wiktionary("tests/hestur.xml") page = wiktionary.get_by_title("hestur") entry = next(page.get_entries()) def test_database_discovers_templates(): assert wiktionary.get_declension_template("kk sb 01") is not None def test_entry_title(): assert entry.name == "hestur" def test_entry_declension_arguments(): assert entry.declension_arguments[0] == "hest" assert entry.declension_arguments[1] == "ur" def test_entry_part_of_speech(): assert entry.category == "nafnorð" def test_entry_part_of_speech(): assert entry.part_of_speech == "Karlkynsnafnorð" def test_entry_is_icelandic():
import click
from database import db as sqldb
from wiktionary import Wiktionary, Declensions
from frequencies import Frequencies
from models import Form, Lemma, Translation, MODELS

# Script: rebuilds the tables listed in MODELS and then walks every entry of
# every page of the Wiktionary built from articles.xml.

# NOTE(review): the name is spelled "wikitionary" (sic); it is left untouched
# because code beyond the visible part of this script may refer to it.
wikitionary = Wiktionary("articles.xml")
frequencies = Frequencies("frequency.csv")
d = Declensions(wikitionary)

# Start from an empty schema: drop the MODELS tables, then create them again.
sqldb.drop_tables(MODELS)
sqldb.create_tables(MODELS)

# Entry names that are skipped outright in the loop below.
known_failures = [
    'Mið-Afríkulýðveldið',
    'mar',
    'endurnýjanleg orka',
    'Garðabær'
]

# NOTE(review): what gets written to `out`, and how `failures` and `count`
# are used, is not visible in this part of the script.
with open("failures.txt", "w") as out:
    failures = []
    count = 0
    # click.progressbar wraps the page iterable and shows progress under the
    # label "populating" while it is consumed.
    with click.progressbar(wikitionary.pages, label="populating") as pages:
        for page in pages:
            for entry in page.get_entries():
                try:
                    # Skip entries listed in known_failures.
                    if entry.name in known_failures:
                        continue
                    # Skip entries whose is_icelandic attribute is falsy.
                    if not entry.is_icelandic:
                        continue
                    # NOTE(review): the remainder of this try block (and its
                    # except clause) lies beyond the visible source.
from wiktionary import Wiktionary, Declensions

# Script: looks up a single word in the Wiktionary built from articles.xml,
# prints every entry of its page and fetches the word's declensions.

# The title to look up.
word = "matseðill"

db = Wiktionary("articles.xml")
d = Declensions(db)

# Fetch the page by title and materialise its entries into a list.
page = db.get_by_title(word)
entries = list(page.get_entries())

# Print whatever to_dict() returns for each entry.
for entry in entries:
    print(entry.to_dict())

# NOTE(review): `declensions` is not used in the visible part of the script;
# presumably it is inspected interactively or by code that follows.
declensions = d.get_declensions(word)