"""Print the verb translations found on selected French Wiktionary pages.

Opens a French Wiktionary XML dump, looks up the pages whose titles are
listed in ``french_words``, feeds the text of the first page returned to
``FrParseText`` and prints the page title and the parsed verb translations.
"""
import sys

# Make the local wiktionary-parser-xml checkout importable before the
# package imports below are resolved.
sys.path.insert(
    0,
    '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml/wiktionary_parser',
)
sys.path.insert(
    0,
    '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml',
)

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.fr.page import frPage
from wiktionary_parser.languages.fr.parseText import FrParseText

# Titles of the pages to look up in the dump.
french_words = set(['sauter'])

# The dump is only needed while pages are being read, so keep it open just
# for the duration of the loop and let ``with`` close it afterwards.
with open(
    '../../../../../datasets/sense_disambiguation_datasets/frwiktionary-20161101-pages-articles-multistream.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, frPage)

    # NOTE(review): other examples for this package unpack ``title, page``
    # from ``from_titles``; this script iterates bare pages. Confirm this
    # matches the ``from_titles`` implementation in this checkout.
    for page in xml_parser.from_titles(french_words):
        parseData = FrParseText(page.text)
        # Single-argument print calls give the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('Title %s' % (page.title,))
        # For debugging, ``page.text``, ``parseData.view_sections()`` and
        # ``parseData.synonyms`` can be inspected here as well.
        print(parseData.verb_translations)
        # Only the first page returned is processed.
        break
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the wiktionary xml file.

For every requested title found in the German dump the page is parsed and,
for each word on it, the title plus any available meanings
(``bedeutungen``), examples (``beispiele``) and gender are printed.
"""

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.de.page import dePage

# Titles of the pages to extract, and the titles seen so far.
german_words = set([u'Bank', u'Kiefer'])
found_words = set()

# Keep the dump open only while it is being read; ``with`` closes it even
# if parsing raises.
with open('../../wiktionary_data/dewiktionary-20110504-pages-articles.xml') as xml_file:
    xml_parser = XMLPageParser(xml_file, dePage)

    for title, page in xml_parser.from_titles(german_words):
        found_words.add(title)
        page.parse()
        for word in page.words:
            print('')
            print(word.title)
            print('******************')
            if word.bedeutungen:
                print('--Bedeutungen---------------')
                print(word.bedeutungen)
            if word.beispiele:
                print('--Beispiele-----------------')
                print(word.beispiele)
            if word.gender:
                print('--Gender--------------------')
                print(word.gender)
        # Stop once every requested title has been handled instead of
        # continuing through the rest of the dump.
        if found_words == german_words:
            break
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the simple.wiktionary xml file.

Each requested page is parsed and a summary of every word on it is printed;
the script stops as soon as all requested titles have been found.
"""

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.simple.page import simplePage

# The words we want to extract, and the titles seen so far.
wanted_words = set([u'fish'])
found_words = set()

# Keep the dump open only while it is being read; ``with`` closes it even
# if parsing raises.
with open('../../wiktionary_data/simplewiktionary-20110514-pages-articles.xml') as xml_file:
    xml_parser = XMLPageParser(xml_file, simplePage)

    for title, page in xml_parser.from_titles(wanted_words):
        page.parse()
        # Print out a summary of each word on the page. A single-argument
        # print call behaves the same on Python 2 and Python 3.
        for word in page.words:
            print(word.summary())
        found_words.add(title)
        # Nothing left to look for: skip the rest of the dump.
        if wanted_words == found_words:
            break
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the simple.wiktionary xml file.

Requested pages are parsed one by one and a summary of each word they
contain is printed. Reading stops once every requested title was found.
"""

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.simple.page import simplePage

# The words we want to extract.
wanted_words = set([u'fish'])
# Titles that have already been processed.
found_words = set()

# The context manager guarantees the dump file is closed when the loop
# finishes or an error occurs.
with open(
        '../../wiktionary_data/simplewiktionary-20110514-pages-articles.xml') as xml_file:
    xml_parser = XMLPageParser(xml_file, simplePage)

    for title, page in xml_parser.from_titles(wanted_words):
        page.parse()
        # Print out a summary of every word on the page; the parenthesised
        # single-argument form works on both Python 2 and Python 3.
        for word in page.words:
            print(word.summary())
        found_words.add(title)
        # All requested titles handled, so there is no need to read on.
        if wanted_words == found_words:
            break
"""
This example extracts a number of words from the wiktionary xml file.

Pages of the German dump whose titles are requested are parsed; for each
word on such a page the title and, when present, its meanings
(``bedeutungen``), examples (``beispiele``) and gender are printed.
"""

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.de.page import dePage

# Titles of the pages to extract.
german_words = set([u'Bank', u'Kiefer'])
# Titles that have already been processed.
found_words = set()

# The context manager guarantees the dump file is closed when the loop
# finishes or an error occurs.
with open(
        '../../wiktionary_data/dewiktionary-20110504-pages-articles.xml') as xml_file:
    xml_parser = XMLPageParser(xml_file, dePage)

    for title, page in xml_parser.from_titles(german_words):
        found_words.add(title)
        page.parse()
        for word in page.words:
            print('')
            print(word.title)
            print('******************')
            if word.bedeutungen:
                print('--Bedeutungen---------------')
                print(word.bedeutungen)
            if word.beispiele:
                print('--Beispiele-----------------')
                print(word.beispiele)
            if word.gender:
                print('--Gender--------------------')
                print(word.gender)
        # Every requested title has been seen: stop reading the dump
        # rather than continuing to its end.
        if found_words == german_words:
            break
# -*- coding: utf-8 -*-
"""
This example extracts a number of words from the wiktionary xml file.

It opens a Dutch Wiktionary dump, looks up the titles in ``dutch_words``,
prints the text of the first page returned, feeds that text to
``NlParseText`` and prints the page title and parsed verb translations.
"""
import sys

# Make the local wiktionary-parser-xml checkout importable before the
# package imports below are resolved.
sys.path.insert(
    0,
    '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml/wiktionary_parser',
)
sys.path.insert(
    0,
    '/disk/scratch/s1146856/project_codes/tools/sense_stuff/wiktionary-parser-xml',
)

from wiktionary_parser.xml_parser import XMLPageParser
from wiktionary_parser.languages.nl.page import nlPage
from wiktionary_parser.languages.nl.parseText import NlParseText

# Titles of the pages to look up in the dump.
dutch_words = set(['springen'])

# The dump is only needed while pages are being read, so keep it open just
# for the duration of the loop and let ``with`` close it afterwards.
with open(
    '../../../../../datasets/sense_disambiguation_datasets/nlwiktionary-20161120-pages-articles-multistream.xml'
) as xml_file:
    xml_parser = XMLPageParser(xml_file, nlPage)

    # NOTE(review): other examples for this package unpack ``title, page``
    # from ``from_titles``; this script iterates bare pages. Confirm this
    # matches the ``from_titles`` implementation in this checkout.
    for page in xml_parser.from_titles(dutch_words):
        # Single-argument print calls give the same output under the
        # Python 2 print statement and the Python 3 print function.
        print(page.text)
        parseData = NlParseText(page.text)
        print('Title %s' % (page.title,))
        # For debugging, ``parseData.view_sections()`` and
        # ``parseData.synonyms`` can be inspected here as well.
        print(parseData.verb_translations)
        # Only the first page returned is processed.
        break