class WikiTagger:
    def __init__(self):
        self.wp = WiktionaryParser()
        self.wp.set_default_language('portuguese')

    def fetch(self, tokens):
        for x in tokens:
            if x.pos == 'TEMP':
                try:
                    parts = []
                    p = self.wp.fetch(x.symbol)
                    for k in p:
                        for y in k['definitions']:
                            # The gender marker ('m' or 'f') appears in the
                            # first definition line of a Portuguese entry.
                            if 'm' in y['text'][0].split():
                                gender = 'MASC'
                            elif 'f' in y['text'][0].split():
                                gender = 'FEMI'
                            else:
                                gender = 'DESC'
                            parts.append(PartOfSpeech(y['partOfSpeech'], gender, 'TEMP'))
                    if parts:
                        x.pos = parts
                    else:
                        # No entry found: fall back to tagging as a proper noun.
                        x.pos = [PartOfSpeech('proper noun', 'DESC', 'TEMP')]
                except Exception as e:
                    print(e)
                    x.pos = 'ERROR'
        return tokens
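# The Token and PartOfSpeech types WikiTagger operates on are not part of
# the snippet; a minimal sketch of compatible stand-ins, assuming a token
# carries a surface form (symbol) and a POS slot initialized to 'TEMP':
class PartOfSpeech:
    def __init__(self, tag, gender, status):
        self.tag = tag          # e.g. 'noun'
        self.gender = gender    # 'MASC', 'FEMI', or 'DESC'
        self.status = status    # 'TEMP' marks a provisional tag

class Token:
    def __init__(self, symbol):
        self.symbol = symbol    # surface form
        self.pos = 'TEMP'       # untagged until WikiTagger.fetch runs

tagged = WikiTagger().fetch([Token('gato'), Token('casa')])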
def obtener_tipo_palabra(palabra):
    """Check that the word exists in Wiktionary.

    If it does, return the word type (adjective, noun, or verb);
    otherwise return an empty string.

    Args:
        palabra (str): Word to validate.

    Returns:
        str: Word type, one of the module constants ADJETIVOS,
        SUSTANTIVOS, or VERBOS. If the word does not exist,
        returns an empty string.
    """
    try:
        parser = WiktionaryParser()
        parser.set_default_language('spanish')
        word = parser.fetch(palabra)
        tipo_palabra = word[0]['definitions'][0]['partOfSpeech']
        if tipo_palabra == 'adjective':
            return ADJETIVOS
        elif tipo_palabra == 'noun':
            return SUSTANTIVOS
        elif tipo_palabra == 'verb':
            return VERBOS
        else:
            return ''
    except IndexError:
        # fetch() returned no entries or no definitions.
        return ''
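# The constants returned above (ADJETIVOS, SUSTANTIVOS, VERBOS) are not
# defined in the snippet; judging by the original docstring they are the
# one-letter codes below. A minimal usage sketch under that assumption:
ADJETIVOS = 'A'
SUSTANTIVOS = 'S'
VERBOS = 'V'

print(obtener_tipo_palabra('rojo'))    # expected: 'A' (adjective)
print(obtener_tipo_palabra('correr'))  # expected: 'V' (verb)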
def get_wiktionary_entry(language, word):
    """Interface for requesting an entry from Wiktionary.

    Arguments:
        language = language of which we want the entry
        word = word of which we want the entry

    Returns:
        parsed wiktionary page
    """
    parser = WiktionaryParser()
    parser.set_default_language(language)
    return parser.fetch(word)
def query(self, word):
    """A method to retrieve Wiktionary data online.

    A wrapper for `WiktionaryParser
    <https://www.github.com/Suyash458/WiktionaryParser>`_.

    Args:
        word (str): word to be queried.

    Returns:
        a list of Wiktionary data points

    Todo:
        Find the specific error that is thrown when too many requests
        are made in parallel.
    """
    # Convert from language code to the canonical name the Wiktionary
    # parser expects.
    language = lookup_language_by_code(self._language)

    # Set the default language.
    parser = WiktionaryParser()
    parser.set_default_language(language)

    # @functools.lru_cache(maxsize=config["cache_size"])
    def retrieve_wikidata(word, parser=parser, silent=True):
        """Helper for querying WiktionaryParser and raising appropriate exceptions."""
        try:
            return parser.fetch(word)
        except TypeError:
            if not silent:
                print("Query '" + word + "' failed. It is probably missing "
                      "in Wiktionary.")
            return None
        except requests.exceptions.ConnectionError:
            print("Query '" + word + "' failed. LDT couldn't reach "
                  "Wiktionary. Your connection may be down or refused "
                  "by the server.")
            return None

    if not self.cache:
        return retrieve_wikidata(word)
    # With a cache (a pre-loaded Wiktionary vocabulary list), only query
    # words known to have an entry.
    if word not in self.cache:
        return None
    return retrieve_wikidata(word)
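# The commented-out functools.lru_cache line above hints at memoization.
# A standalone sketch of that idea, assuming a config dict with a
# "cache_size" entry (in LDT this would come from the library's config):
import functools

import requests
from wiktionaryparser import WiktionaryParser

config = {"cache_size": 128}  # assumed value

_parser = WiktionaryParser()
_parser.set_default_language('english')

@functools.lru_cache(maxsize=config["cache_size"])
def cached_fetch(word):
    """Memoize lookups so repeated queries don't hit Wiktionary again."""
    try:
        return _parser.fetch(word)
    except (TypeError, requests.exceptions.ConnectionError):
        return None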
class Wiktionary:
    def __init__(self, bot):
        self.bot = bot
        self.parser = WiktionaryParser()
        self.words = {}
        self.output = None
        self.embed = None
        self.parser.set_default_language('english')

    def __fetch_word(self, word):
        self.words = self.parser.fetch(word)

    @commands.group(brief='Gives you a word\'s etymology, definition, examples etc.')
    async def word(self, ctx):
        pass

    @word.command(brief='Changes the language the command will use.')
    async def lang(self, ctx, lang):
        self.parser.set_default_language(lang)
        language_list = 'https://en.wiktionary.org/wiki/Wiktionary:List_of_languages'
        await ctx.send(f'Language changed to {lang}.\n'
                       f'The list of languages can be found here: {language_list}')

    @word.command(brief='Gives you a word\'s etymologies.', aliases=['e', 'ety'])
    async def etymology(self, ctx, word):
        self.__fetch_word(word)
        title = word
        description = f'{len(self.words)} results found.'
        self.embed = discord.Embed(color=ctx.message.author.color,
                                   title=title, description=description)
        # Show at most the first three entries.
        for i, entry in enumerate(self.words[:3], 1):
            self.embed.add_field(name=i, value=entry['etymology'])
        await ctx.send(embed=self.embed)

    @word.command(brief='Gives you example usages for a word.', aliases=['ex'])
    async def example(self, ctx, word):
        self.__fetch_word(word)
        self.output = [str(entry['definitions'][0]['examples'])
                       for entry in self.words[:3]]
        await ctx.send('\n'.join(self.output))

    @word.command(brief='Gives you a word\'s definition.', aliases=['d', 'def'])
    async def definition(self, ctx, word):
        self.__fetch_word(word)
        self.output = [str(entry['definitions'][0]['text'])
                       for entry in self.words[:3]]
        await ctx.send('\n'.join(self.output))
def get_wiktionary_entry(language, word):
    """Interface for requesting an entry from Wiktionary.

    Arguments:
        language = language of which we want the entry
        word = word of which we want the entry

    Returns:
        parsed wiktionary page, or an empty list on failure
    """
    parser = WiktionaryParser()
    parser.set_default_language(language)
    try:
        return parser.fetch(word)
    except Exception as e:
        print("problem with word {}, language {}".format(word, language))
        print(e)
        return []
def new_word_db_fetch(words, wik_word_index=0, wik_nonword_index=0):
    # Set up the Wiktionary parser for Danish, restricted to the
    # parts of speech we care about.
    wik_parser = WiktionaryParser()
    wik_parser.set_default_language('danish')
    wik_parser.RELATIONS = []
    wik_parser.PARTS_OF_SPEECH = [
        "noun", "verb", "adjective", "adverb", "proper noun"
    ]
    new_words = []
    new_nonwords = []
    for word in tqdm(words):
        try:
            data = wik_parser.fetch(word)
            if len(data) == 0:
                new_nonwords.append(word)
            else:
                new_words.append(word)
        except AttributeError:
            print("Something went wrong while looking up a word on Wiktionary.")
            continue
    csv_append('word_datasets/wik_nonwords.csv', new_nonwords, wik_nonword_index)
    csv_append('word_datasets/wik_words.csv', new_words, wik_word_index)
    return new_words, new_nonwords
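# csv_append is not defined in the snippet; a plausible stand-in that
# appends (index, word) rows to the given CSV file, numbering from the
# supplied start index:
import csv

def csv_append(path, words, start_index):
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for i, word in enumerate(words, start=start_index):
            writer.writerow([i, word])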
def fetch_word(ui):
    parser = WiktionaryParser()
    parser.set_default_language('hungarian')
    text = ui.lineEdit.text()  # read the query from the Qt line edit
    word = parser.fetch(text)
    return word
#!/usr/bin/env python3
from wiktionaryparser import WiktionaryParser

parser = WiktionaryParser()
word = parser.fetch('domus')
another_word = parser.fetch('domus', 'latin')
parser.set_default_language('french')
parser.exclude_part_of_speech('noun')
parser.include_relation('alternative forms')
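# For reference, fetch() returns a list of entry dicts shaped as in the
# WiktionaryParser README; a sketch of accessing one entry:
entry = parser.fetch('test', 'english')[0]
entry['etymology']                 # etymology text (str)
entry['pronunciations']['text']    # list of IPA strings
definition = entry['definitions'][0]
definition['partOfSpeech']         # e.g. 'noun'
definition['text']                 # list of definition strings
definition['examples']             # list of example sentences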
import re
import time

from wiktionaryparser import WiktionaryParser

# Collect every manually tagged term that does not yet have a
# pronunciation entry.
terms = set()
for line in open("data/manually_tagged_dictionary_eng.txt"):
    term = line.split()[0]
    terms.add(term)
for line in open("data/wiktionary_pronunciation_eng.txt"):
    term = line.split()[0]
    terms.remove(term)

parser = WiktionaryParser()
parser.set_default_language('english')

outfile = open("data/wiktionary_pronunciation_eng.txt", "a")
for term in terms:
    word = parser.fetch(term)
    ps = []
    for w in word:
        phone = " ".join(w["pronunciations"]["text"])
        # Extract the IPA transcriptions, which appear between slashes.
        for ipa in re.findall("/(\\w+)/", phone):
            ps.append(ipa)
    outfile.write(term + " " + ",".join(ps) + "\n")
    outfile.flush()
    time.sleep(1)  # rate-limit requests to Wiktionary
def main(x):
    """Run administrative tasks."""
    parser = WiktionaryParser()
    parser.set_default_language('english')
    toDelete = list()
    toMap = dict()
    # QS = Dict.objects.all()
    QS = Dict.objects.filter(word__startswith=x)
    for qs in QS:
        # Drop entries containing non-word characters.
        if re.search(r'\W', qs.word):
            toDelete.append(qs.word)
            continue
        wiki = parser.fetch(qs.word)
        if len(wiki) == 0:
            toDelete.append(qs.word)
        # Check whether a definition matches one of the patterns.
        for i in range(len(wiki)):
            if 'definitions' not in wiki[i]:
                continue
            for j in range(len(wiki[i]['definitions'])):
                if 'text' not in wiki[i]['definitions'][j]:
                    continue
                texts = wiki[i]['definitions'][j]['text']
                for text in texts:
                    m = match(text, definition_patterns)
                    if m:
                        toMap[qs.word] = m.group(1)
                        print('definition: ' + qs.word)
                        continue
                    m = re.match(r'.*\sof\s(\w+)$', text)
                    if m:
                        print(text)
                        print(m.group(1))
        # Check whether the etymology matches one of the patterns
        # (e.g. adverb = adjective + -ly).
        for i in range(len(wiki)):
            if 'etymology' not in wiki[i]:
                continue
            text = wiki[i]['etymology']
            m = match(text, etymology_patterns)
            if m:
                toMap[qs.word] = m.group(1)
                print('etymology: ' + qs.word)
    for keyword in toDelete:
        Dict.objects.filter(word=keyword).delete()
    for keyword in toMap:
        if len(Dict.objects.filter(word=keyword)) >= 1:
            if len(Dict.objects.filter(word=toMap[keyword])) >= 1:
                # Replace the derived word with a mapping to its origin.
                Dict.objects.filter(word=keyword).delete()
                dict_map = DictMap()
                dict_map.word = keyword
                dict_map.origin = toMap[keyword]
                dict_map.save()
    'es': [
        'https://fr.wiktionary.org/wiki/Cat%C3%A9gorie:Termes_vulgaires_en_espagnol'
    ],
    # (no epitran model for Italian)
    'it': [
        'https://fr.wiktionary.org/wiki/Cat%C3%A9gorie:Termes_vulgaires_en_italien'
    ],
}

epitran_langs = {
    'en': 'eng-Latn',
    'fr': 'fra-Latn',
    'es': 'spa-Latn',
}

for lang in my_urls:
    parser = WiktionaryParser()
    parser.set_default_language(lang)
    parser.include_relation('alternative forms')
    epi = epitran.Epitran(epitran_langs[lang])

    # Prepare the database
    import base

    urllst = my_urls[lang]
    for url in urllst:
        uClient = uReq(url)
        page_html = uClient.read()
        page_soup = soup(page_html, "html.parser")
        parser = WiktionaryParser()
class wiktionaryGraph(Base):
    def __init__(self, docpath, language='english'):
        super(wiktionaryGraph, self).__init__(docpath)
        self.parser = WiktionaryParser()
        self.parser.set_default_language(language)
        self.wiki_parse = defaultdict()
        self.pronunciations = defaultdict()
        self.definitions = defaultdict()
        self.etymologies = defaultdict()
        # Still undecided whether the graph in general should have multiple
        # links or one link with multiple inputs.
        self.wikiGraph = nx.MultiGraph()

    def print_test(self, word='test'):
        return self.parser.fetch(word)

    def get_corpus_wiktionary(self, instance='definitions'):
        for (id, document) in enumerate(self.corpus):
            for word in set(document.split()):
                try:
                    self.wiki_parse[id] = dict(word=self.parser.fetch(word))
                except Exception:
                    pass  # skip words that fail to parse
        return self.wiki_parse

    def get_document_wiktionary(self, doc_id, instance='definitions'):
        doc_def = defaultdict()
        for word in set(self.corpus[doc_id].split()):
            try:
                doc_def[word] = self.parser.fetch(word)
            except Exception:
                pass  # skip words that fail to parse
        return doc_def

    def build_wiktionary_graph(self, scores=None):
        # Note: uses the pre-2.0 NetworkX attribute API (.node / .edge).
        if scores is None:
            scores = np.ones((len(self.wikiGraph.nodes()),), dtype=float)
        for document in self.corpus:
            words = document.split()
            for i, word in enumerate(words):
                try:
                    next_word = words[i + 1]
                    if not self.wikiGraph.has_node(word):
                        self.wikiGraph.add_node(word)
                        self.wikiGraph.node[word]['count'] = scores[i]
                    else:
                        self.wikiGraph.node[word]['count'] += scores[i]
                    if not self.wikiGraph.has_node(next_word):
                        self.wikiGraph.add_node(next_word)
                        self.wikiGraph.node[next_word]['count'] = 0
                    if not self.wikiGraph.has_edge(word, next_word):
                        self.wikiGraph.add_edge(word, next_word,
                                                weight=maxsize - scores[i])
                    else:
                        self.wikiGraph.edge[word][next_word]['weight'] -= scores[i]
                except IndexError:
                    # Last word in the document: update its count only.
                    if not self.wikiGraph.has_node(word):
                        self.wikiGraph.add_node(word)
                        self.wikiGraph.node[word]['count'] = scores[i]
                    else:
                        self.wikiGraph.node[word]['count'] += scores[i]
                except Exception:
                    raise
arg_parser = argparse.ArgumentParser(
    description='Enter a processed "Passages_[my file].csv" file '
                'to play a generation game.')
arg_parser.add_argument(
    'data_folder', metavar='N', type=str, nargs='+',
    help='a folder containing a Passages file with the following columns: '
         'Passage, Target, passage_no_target')

def_parser = WiktionaryParser()
def_parser.set_default_language('italian')

foldername = arg_parser.parse_args().data_folder[0]
# passage_df = pd.read_csv(foldername + '/data/Passages.csv', encoding="utf-8")
words_df = pd.read_csv(foldername + '/data/words.csv', encoding="utf-8")
total_points = 0

# Program loop: pick a random passage each round.
while True:
    passage_num = np.random.randint(
        min(words_df.passage_num) + 1,
        max(words_df.passage_num) - 1)
    words_passage_df = words_df >> mask(X.passage_num == passage_num)
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from google.cloud import translate_v3beta1 as translate
from wiktionaryparser import WiktionaryParser

from helpers import get_logger

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.google/default-ce-d2e59ab3fd13.json'

translate_client = translate.TranslationServiceClient()
logger = get_logger(__name__)

wiktionary_parser = WiktionaryParser()
wiktionary_parser.set_default_language('english')


def get_sentence(tag):
    """Return the tag's text with line breaks stripped."""
    return tag.get_text().replace("\n", "").replace("\r", "")


def extract_translation(tag):
    """Drop the last two whitespace-separated tokens from the tag text."""
    return ' '.join(tag.get_text().split(' ')[:-2])


def filter_sentence_tags(tags):
    """Return the first tag containing more than four space-separated tokens."""
    for tag in tags:
        if len(tag.get_text().split(' ')) > 4:
            return tag