Ejemplo n.º 1
0
class WikiTagger:
  """Replaces placeholder part-of-speech tags with candidates from Wiktionary."""

  def __init__(self):
    """Create a WiktionaryParser whose default language is Portuguese."""
    parser = WiktionaryParser()
    parser.set_default_language('portuguese')
    self.wp = parser

  def fetch(self, tokens):
    """Resolve every token whose ``pos`` equals ``'TEMP'``.

    For such a token, ``token.symbol`` is looked up and ``token.pos``
    becomes a list with one ``PartOfSpeech`` per definition found. The
    gender is 'MASC' or 'FEMI' when the first text line of the definition
    contains a standalone ``m`` or ``f``, and 'DESC' otherwise. When no
    definition is found the list holds a single 'proper noun' fallback.
    Any exception during the lookup is printed and ``token.pos`` is set
    to the string ``"ERROR"``.

    Tokens with any other ``pos`` are left untouched.

    Returns:
        The same ``tokens`` object, modified in place.
    """
    for token in tokens:
      if token.pos != 'TEMP':
        continue
      try:
        candidates = []
        for entry in self.wp.fetch(token.symbol):
          for definition in entry['definitions']:
            # Gender markers appear as separate words on the headword line.
            markers = definition['text'][0].split()
            if 'm' in markers:
              gender = 'MASC'
            elif 'f' in markers:
              gender = 'FEMI'
            else:
              gender = 'DESC'
            candidates.append(
              PartOfSpeech(definition['partOfSpeech'], gender, "TEMP"))
        token.pos = candidates if candidates else [
          PartOfSpeech('proper noun', 'DESC', 'TEMP'),
        ]
      except Exception as error:
        print(error)
        token.pos = "ERROR"
    return tokens
Ejemplo n.º 2
0
def obtener_tipo_palabra(palabra):
    """
    Check that a Spanish word exists on Wiktionary and classify it.

    Only the part of speech of the first definition of the first entry
    is considered.

    Args:
        palabra (str): Word to look up.

    Returns:
        ADJETIVOS for an adjective, SUSTANTIVOS for a noun, VERBOS for a
        verb. An empty string when the part of speech is anything else or
        when the lookup yields no entry/definition (IndexError).
    """
    wiktionary = WiktionaryParser()
    wiktionary.set_default_language('spanish')

    try:
        entradas = wiktionary.fetch(palabra)
        categoria = entradas[0]['definitions'][0]['partOfSpeech']
    except IndexError:
        # No entry, or an entry without definitions: treat as "not a word".
        return ''

    if categoria == 'adjective':
        return ADJETIVOS
    if categoria == 'noun':
        return SUSTANTIVOS
    if categoria == 'verb':
        return VERBOS
    return ''
def get_wiktionary_entry(language, word):
    """Request the Wiktionary entry of a word in a given language.

    Arguments:
        language = language of which we want the entry
        word = word of which we want the entry
    Returns:
        the result of WiktionaryParser.fetch for that word
    """
    wiktionary = WiktionaryParser()
    wiktionary.set_default_language(language)
    entry = wiktionary.fetch(word)
    return entry
Ejemplo n.º 4
0
    def query(self, word):
        """Retrieve Wiktionary data for a word online.

        A wrapper for `WiktionaryParser
        <https://www.github.com/Suyash458/WiktionaryParser>`_.

        When ``self.cache`` is non-empty, only words contained in it are
        queried; any other word yields ``None`` without a request.

        Args:
            word (str): word to be queried.

        Returns:
            a list of Wiktionary data points, or ``None`` when the word is
            filtered out by the cache or the query fails.

        Todo:
            Find the specific error that is thrown when too many
            requests are made in parallel.
        """

        # The parser is configured with the name that
        # lookup_language_by_code returns for self._language.
        language_name = lookup_language_by_code(self._language)
        wiktionary = WiktionaryParser()
        wiktionary.set_default_language(language_name)

        # @functools.lru_cache(maxsize=config["cache_size"])
        def retrieve_wikidata(word, parser=wiktionary, silent=True):
            """Fetch ``word`` and turn the handled failures into ``None``."""
            try:
                return parser.fetch(word)
            except TypeError:
                # Reported only on request; the result is None either way.
                if not silent:
                    print("Query '" + word +
                          "' failed. It is probably missing in "
                          "Wiktionary.")
            except requests.exceptions.ConnectionError:
                print("Query '" + word + "' failed. LDT couldn't reach "
                      "Wiktionary. Your connection may be down or refused "
                      "by the server.")
            return None

        if self.cache and word not in self.cache:
            return None
        return retrieve_wikidata(word)
Ejemplo n.º 5
0
class Wiktionary:
	"""Discord command group that looks words up on Wiktionary.

	The last lookup result, text output and embed are kept on the instance
	in ``words``, ``output`` and ``embed``.
	"""

	# Upper bound on how many Wiktionary entries a single reply reports.
	MAX_RESULTS = 3

	def __init__(self, bot):
		"""Store the bot and create a parser defaulting to English."""
		self.bot = bot

		self.parser = WiktionaryParser()
		self.words = {}
		self.output = None
		self.embed = None

		self.parser.set_default_language('english')

	def __fetch_word(self, word):
		"""Fetch ``word`` with the current parser and store it in ``self.words``."""
		self.words = self.parser.fetch(word)

	def __first_definition_values(self, field):
		"""Return ``str(field)`` of the first definition of the leading entries.

		Only the first ``MAX_RESULTS`` entries are inspected; entries with an
		empty ``definitions`` list are skipped instead of raising IndexError.
		"""
		return [
			str(entry['definitions'][0][field])
			for entry in self.words[:self.MAX_RESULTS]
			if entry['definitions']
		]

	async def __send_output(self, ctx, word):
		"""Send ``self.output`` joined by newlines, or a notice when it is empty."""
		print(self.output)
		if not self.output:
			# Joining an empty list gives '', and Discord rejects empty messages.
			await ctx.send(f'No results found for {word}.')
			return
		await ctx.send('\n'.join(self.output))

	@commands.group(brief='Gives you a word\'s etymology, definition, examples etc.')
	async def word(self, ctx):
		"""Parent command; the work is done by its subcommands."""
		pass

	@word.command(brief='Changes the language the command will use.')
	async def lang(self, ctx, lang):
		"""Change the parser's default language and confirm it to the channel."""
		self.parser.set_default_language(lang)
		language_list = 'https://en.wiktionary.org/wiki/Wiktionary:List_of_languages'
		await ctx.send(f'Language changed to {lang}.\nThe list of languages can be found here: {language_list}')

	@word.command(brief='Gives you a word\'s etymologies.', aliases=['e', 'ety'])
	async def etymology(self, ctx, word):
		"""Send an embed with the etymology of up to ``MAX_RESULTS`` entries."""
		self.__fetch_word(word)
		title = word
		description = f'{len(self.words)} results found.'
		self.embed = discord.Embed(color=ctx.message.author.color, title=title, description=description)
		# A separate loop name keeps the ``word`` argument intact.
		for i, entry in enumerate(self.words[:self.MAX_RESULTS], 1):
			# Embed field values must be non-empty, so substitute a placeholder.
			self.embed.add_field(name=i, value=entry['etymology'] or 'No etymology available.')
		await ctx.send(embed=self.embed)

	@word.command(brief='Gives you example usages for a word.', aliases=['ex'])
	async def example(self, ctx, word):
		"""Send the examples of the first definition of each leading entry."""
		self.__fetch_word(word)
		self.output = self.__first_definition_values('examples')
		await self.__send_output(ctx, word)

	@word.command(brief='Gives you a word\'s definition.', aliases=['d', 'def'])
	async def definition(self, ctx, word):
		"""Send the text of the first definition of each leading entry."""
		self.__fetch_word(word)
		self.output = self.__first_definition_values('text')
		await self.__send_output(ctx, word)
def get_wiktionary_entry(language, word):
    """Request the Wiktionary entry of a word, tolerating fetch failures.

    Arguments:
        language = language of which we want the entry
        word = word of which we want the entry
    Returns:
        the parsed wiktionary page, or an empty list when fetching raised
        an exception (the problem is printed)
    """
    wiktionary = WiktionaryParser()
    wiktionary.set_default_language(language)
    try:
        entry = wiktionary.fetch(word)
    except Exception as error:
        print("problem with word {}, language {}".format(word, language))
        print(error)
        entry = []
    return entry
Ejemplo n.º 7
0
def new_word_db_fetch(words, wik_word_index=0, wik_nonword_index=0):
    """Split ``words`` by whether Wiktionary has a Danish entry for them.

    Each word is fetched with a parser restricted to five parts of speech
    and no relations. Words whose fetch raises AttributeError are reported
    and end up in neither list. Both lists are then passed to
    ``csv_append`` together with their respective index argument.

    Returns:
        A tuple ``(new_words, new_nonwords)``.
    """
    wik_parser = WiktionaryParser()
    wik_parser.set_default_language('danish')
    wik_parser.RELATIONS = []
    wik_parser.PARTS_OF_SPEECH = [
        "noun", "verb", "adjective", "adverb", "proper noun"
    ]

    found, missing = [], []
    for candidate in tqdm(words):
        try:
            entries = wik_parser.fetch(candidate)
        except AttributeError:
            print("something went wrong, with fidning a word on WikWord.")
            continue
        # An empty result means Wiktionary has no matching entry.
        bucket = missing if len(entries) == 0 else found
        bucket.append(candidate)

    csv_append('word_datasets/wik_nonwords.csv', missing,
               wik_nonword_index)
    csv_append('word_datasets/wik_words.csv', found, wik_word_index)
    return found, missing
Ejemplo n.º 8
0
def fetch_word(ui):
    """Fetch the Hungarian Wiktionary entry for the text in ``ui.lineEdit``."""
    wiktionary = WiktionaryParser()
    wiktionary.set_default_language('hungarian')
    query_text = ui.lineEdit.text()
    return wiktionary.fetch(query_text)
Ejemplo n.º 9
0
#!/usr/bin/env python3
from wiktionaryparser import WiktionaryParser
# Minimal WiktionaryParser walkthrough: two fetches, then reconfiguration.
parser = WiktionaryParser()
# No language has been set yet, so this uses the parser's own default.
word = parser.fetch('domus')
# Same word, with the language passed explicitly for this call.
another_word = parser.fetch('domus', 'latin')
# The remaining calls only reconfigure the parser; nothing is fetched after them here.
parser.set_default_language('french')
parser.exclude_part_of_speech('noun')
parser.include_relation('alternative forms')
import time
# Collect the terms that still need a pronunciation: every first column of
# the manually tagged dictionary, minus the terms already present in the
# pronunciation file written by an earlier run.
terms = set()
with open("data/manually_tagged_dictionary_eng.txt") as tagged_file:
    for line in tagged_file:
        fields = line.split()
        # Blank lines have no first column to read.
        if fields:
            terms.add(fields[0])

with open("data/wiktionary_pronunciation_eng.txt") as done_file:
    for line in done_file:
        fields = line.split()
        if fields:
            # discard() tolerates duplicates and terms that are not (or no
            # longer) in the tagged dictionary; remove() would raise KeyError.
            terms.discard(fields[0])


import re
from wiktionaryparser import WiktionaryParser
parser = WiktionaryParser()
parser.set_default_language('english')

# Append one "<term> <ipa>,<ipa>,..." line per remaining term. The file is
# flushed after every term so an interrupted run keeps what it has fetched,
# and the context manager closes it even when a fetch raises.
with open("data/wiktionary_pronunciation_eng.txt", "a") as outfile:
    for term in terms:
        word = parser.fetch(term)
        ps = []
        for w in word:
            phone = " ".join(w["pronunciations"]["text"])
            # Keep only transcriptions written between slashes, e.g. /wɜːd/.
            ps.extend(re.findall("[/](\\w+)[/]", phone))
        outfile.write(term + " " + ",".join(ps) + "\n")
        outfile.flush()
        # Pause between requests to avoid hammering Wiktionary.
        time.sleep(1)
Ejemplo n.º 11
0
def main(x):
    """Clean up ``Dict`` rows whose word starts with ``x`` using Wiktionary.

    Each matching word is handled as follows:

    * A word with a non-word character, or with no English Wiktionary
      entry, is queued for deletion.
    * Otherwise its definition texts and etymologies are run through
      ``match`` with ``definition_patterns`` / ``etymology_patterns``. The
      first capture group of a hit is recorded as the word's origin; a
      later hit overwrites an earlier one, so an etymology hit wins.
    * Definition texts ending in "... of <word>" are only printed.

    After the scan, queued words are deleted. Every word with a recorded
    origin is replaced by a ``DictMap`` row (word -> origin), provided both
    the word and its origin still exist in ``Dict``.

    Args:
        x: Prefix passed to the ``word__startswith`` filter.
    """

    parser = WiktionaryParser()
    parser.set_default_language('english')
    toDelete = []
    toMap = {}
    # QS = Dict.objects.all()
    QS = Dict.objects.filter(word__startswith=x)
    for qs in QS:
        # Raw strings keep the regex escapes from being read as (invalid)
        # string escapes.
        if re.search(r'\W', qs.word):
            toDelete.append(qs.word)
            continue
        wiki = parser.fetch(qs.word)
        if len(wiki) == 0:
            toDelete.append(qs.word)

        # whether in patterns
        for entry in wiki:
            if 'definitions' not in entry:
                continue
            for definition in entry['definitions']:
                if 'text' not in definition:
                    continue
                for text in definition['text']:
                    m = match(text, definition_patterns)
                    if m:
                        toMap[qs.word] = m.group(1)
                        print('definition: ' + qs.word)
                        continue
                    m = re.match(r'.*\sof\s(\w+)$', text)
                    if m:
                        print(text)
                        print(m.group(1))

        # whether + ly
        for entry in wiki:
            if 'etymology' not in entry:
                continue
            m = match(entry['etymology'], etymology_patterns)
            if m:
                toMap[qs.word] = m.group(1)
                print('etymology: ' + qs.word)

    for keyword in toDelete:
        Dict.objects.filter(word=keyword).delete()

    for keyword, origin in toMap.items():
        # Only map a word when both it and its origin are still in Dict.
        if (len(Dict.objects.filter(word=keyword)) >= 1
                and len(Dict.objects.filter(word=origin)) >= 1):
            Dict.objects.filter(word=keyword).delete()
            mapping = DictMap()
            mapping.word = keyword
            mapping.origin = origin
            mapping.save()
Ejemplo n.º 12
0
    'es': [
        'https://fr.wiktionary.org/wiki/Cat%C3%A9gorie:Termes_vulgaires_en_espagnol'
    ],
    # (pas d'epitran pour l'Italien)   'it': ['https://fr.wiktionary.org/wiki/Cat%C3%A9gorie:Termes_vulgaires_en_italien'],
}

# Maps the language keys used by this script to the codes passed to
# epitran.Epitran below.
epitran_langs = {
    'en': 'eng-Latn',
    'fr': 'fra-Latn',
    'es': 'spa-Latn',
}

# Process every language in my_urls: set up a Wiktionary parser and an
# Epitran instance, then download each category page of that language.
for lang in my_urls:

    parser = WiktionaryParser()
    # NOTE(review): `lang` is a key of my_urls (e.g. 'es'); confirm that
    # set_default_language accepts such codes rather than language names.
    parser.set_default_language(lang)
    parser.include_relation('alternative forms')
    epi = epitran.Epitran(epitran_langs[lang])

    # Prepare the database
    # (the import statement runs on every iteration of the outer loop)
    import base

    urllst = my_urls[lang]
    for url in urllst:

        uClient = uReq(url)
        page_html = uClient.read()
        # NOTE(review): uClient is not closed anywhere in the visible code.

        page_soup = soup(page_html, "html.parser")

        # Rebinds `parser` to a fresh instance, so the language and relation
        # configured above no longer apply to it.
        # NOTE(review): the snippet appears to be cut off here -- confirm.
        parser = WiktionaryParser()
class wiktionaryGraph(Base):
    """Wiktionary lookups and a word-adjacency graph over ``self.corpus``.

    ``self.corpus`` is not set here (presumably provided by ``Base`` --
    verify); this class treats it as an indexable sequence of
    whitespace-separated document strings.

    The graph code uses the NetworkX 2.x accessors (``G.nodes[n]`` and
    ``G[u][v][key]``).
    """

    def __init__(self,docpath,language='english'):
        """Initialise the base class, the parser and the empty containers.

        Args:
            docpath: Forwarded unchanged to ``Base.__init__``.
            language: Default language for the Wiktionary parser.
        """

        super(wiktionaryGraph, self).__init__(docpath)

        self.parser = WiktionaryParser()
        self.parser.set_default_language(language)

        self.wiki_parse = defaultdict()
        self.pronounciations = defaultdict()
        self.definitions = defaultdict()
        self.etymologies = defaultdict()

        # still undecided whether the graph in general should have multiple links
        # or if it should be one link with multiple inputs.

        self.wikiGraph = nx.MultiGraph()

    def print_test(self,word='test'):
        """Fetch a single word and return the parser result unchanged."""
        test_word = self.parser.fetch(word)
        return test_word

    def get_corpus_wiktionary(self,instance='definitions'):
        """Fetch the Wiktionary entry of every distinct word of every document.

        Args:
            instance: Currently unused; kept for interface compatibility.

        Returns:
            ``self.wiki_parse``, mapping a document index to a dict of
            ``{word: parser result}``. Words whose fetch fails are skipped,
            and a document with no successful fetch gets no key.
        """

        for (doc_id,document) in enumerate(self.corpus):
            entries = {}
            for word in set(document.split()):
                try:
                    # Key by the actual word; ``dict(word=...)`` would store
                    # everything under the literal key "word".
                    entries[word] = self.parser.fetch(word)
                except Exception:
                    # Best effort: a failed lookup must not abort the corpus.
                    pass
            if entries:
                self.wiki_parse[doc_id] = entries

        return self.wiki_parse

    def get_document_wiktionary(self,doc_id,instance='definitions'):
        """Fetch the Wiktionary entry of every distinct word of one document.

        Args:
            doc_id: Index into ``self.corpus``.
            instance: Currently unused; kept for interface compatibility.

        Returns:
            A mapping ``{word: parser result}``; failed lookups are skipped.
        """

        doc_def = defaultdict()
        for word in set(self.corpus[doc_id].split()):
            try:
                doc_def[word] = self.parser.fetch(word)
            except Exception:
                # Best effort: skip words the parser cannot handle.
                pass

        return doc_def

    def _add_count(self, word, amount):
        """Add ``amount`` to the node's 'count', creating the node if needed."""
        if not self.wikiGraph.has_node(word):
            self.wikiGraph.add_node(word, count=amount)
        else:
            self.wikiGraph.nodes[word]['count'] += amount

    def build_wiktionary_graph(self,scores=None):
        """Add every word and every adjacent word pair of the corpus to the graph.

        A word at position ``i`` of a document adds ``scores[i]`` to its
        node's 'count'. The edge to the following word starts at weight
        ``maxsize - scores[i]``; each repetition of the pair lowers it by
        ``scores[i]``.

        Args:
            scores: Sequence indexed by word position within a document.
                Defaults to ones, sized for the longest document.

        Raises:
            IndexError: If ``scores`` is shorter than a document.
        """

        if scores is None:
            # scores is indexed by word position, so it has to cover the
            # longest document (the graph itself may still be empty here).
            lengths = [len(document.split()) for document in self.corpus]
            longest = max(lengths) if lengths else 0
            scores = np.ones((longest,), dtype=float)

        for document in self.corpus:
            words = document.split()
            for i, word in enumerate(words):
                self._add_count(word, scores[i])

                # The last word of a document has no successor to link to.
                if i + 1 >= len(words):
                    continue
                next_word = words[i + 1]

                if not self.wikiGraph.has_node(next_word):
                    self.wikiGraph.add_node(next_word, count=0)
                if not self.wikiGraph.has_edge(word, next_word):
                    self.wikiGraph.add_edge(word, next_word, weight=maxsize - scores[i])
                else:
                    # MultiGraph attributes live under an edge key. Only one
                    # edge per pair is ever added here, so its key is 0.
                    self.wikiGraph[word][next_word][0]['weight'] -= scores[i]
Ejemplo n.º 14
0
# Command-line interface: one or more folder names; only the first is used
# (see `foldername` below).
arg_parser = argparse.ArgumentParser(
    description=
    'Enter a processed "Passages_[my file].csv" file to play a generation game.'
)
arg_parser.add_argument(
    'data_folder',
    metavar='N',
    type=str,
    nargs='+',
    help=
    'a folder containing a Passages file with the following columns: Passage, Target, passage_no_target'
)

# Wiktionary parser configured for Italian.
def_parser = WiktionaryParser()
def_parser.set_default_language('italian')

# nargs='+' yields a list; take its first element as the data folder.
foldername = arg_parser.parse_args().data_folder[0]

#passage_df = pd.read_csv(foldername + '/data/Passages.csv', encoding="utf-8")
words_df = pd.read_csv(foldername + '/data/words.csv', encoding="utf-8")

total_points = 0

#program loop
# NOTE(review): no exit condition is visible in this part of the loop; the
# rest of the body appears to be cut off -- confirm against the full file.
while True:
    # Draw a random passage number. np.random.randint excludes its upper
    # bound, so the draw lies in [min + 1, max - 2].
    # NOTE(review): confirm that skipping the first passage and the last two
    # is intended.
    passage_num = np.random.randint(
        min(words_df.passage_num) + 1,
        max(words_df.passage_num) - 1)
    # Keep only the rows of the drawn passage (`>> mask(X...)` looks like
    # dfply pipe syntax -- verify against the imports).
    words_passage_df = words_df >> mask(X.passage_num == passage_num)
Ejemplo n.º 15
0
import time
from google.cloud import translate_v3beta1 as translate
from helpers import get_logger
import requests
from bs4 import BeautifulSoup
from wiktionaryparser import WiktionaryParser
import re
import os


# Module-level setup, executed at import time.
# Set the credentials path in the environment before the translation client
# is created.
os.environ['GOOGLE_APPLICATION_CREDENTIALS']='.google/default-ce-d2e59ab3fd13.json'
translate_client = translate.TranslationServiceClient()
logger = get_logger(__name__)
# Shared Wiktionary parser, defaulting to English.
wiktionary_parser = WiktionaryParser()
wiktionary_parser.set_default_language('english')


def get_sentence(tag):
    """Return the tag's text with every newline and carriage return removed."""
    text = tag.get_text()
    for line_break in ("\n", "\r"):
        text = text.replace(line_break, "")
    return text


def extract_translation(tag):
    """Return the tag's text without its last two space-separated pieces."""
    pieces = tag.get_text().split(' ')
    # Drop the two trailing pieces (or everything, when there are fewer).
    del pieces[-2:]
    return ' '.join(pieces)


def filter_sentence_tags(tags):
    """Return the first tag whose text has more than four space-separated pieces."""
    for candidate in tags:
        piece_count = len(candidate.get_text().split(' '))
        if piece_count > 4:
            return candidate