@TextScraper._needs_download
def plural(self):
    ''' Tries to scrape the plural form of a noun from the downloaded page.

    Returns a one-element list: the plural form, [''] if the page lists no
    plural for the noun, or [None] if the word is not tagged as a noun.
    '''

    if 'NN' not in self.pos():
        return [None]

    heading = self.tree.xpath(
        '//div[@class="grad733100"]/h2[@class="inline"]')[0].text_content()
    # Strip surrounding 'I' and space characters, then drop the first token
    # (the article in front of the noun) and glue the remaining tokens together.
    heading = heading.strip('I ')
    word = ''.join(heading.split(' ')[1:])

    info = ''
    tables = self.tree.xpath('//div[@class="grad733100"]/table')
    if tables:
        info = tables[0].text_content().encode('latin-1')

    # The dot of "(meerv.)" is escaped so that it only matches a literal dot.
    suffix = re.search(r'-(\w+) \(meerv\.\)', info, re.U)
    if suffix:
        # Suffix is provided
        return [word + suffix.group(1).strip()]
    form = re.search(r'([\w|\s]+) \(meerv\.\)', info, re.U)
    if form:
        # Plural form is provided
        return [form.group(1).strip()]
    # There is no plural
    return ['']


register(MijnWoordenBoekNl)
def pos(self, element=None):
    ''' Tries to decide about the part of speech.

    With an element, returns the tags ('NN', 'VB', 'JJ') detected in it.
    Without one, collects the tags of every entry of self.elements that
    starts with self.word. The result is a duplicate-free list in no
    particular order.
    '''

    tags = []
    if element:
        # NOTE(review): the '|' inside [\w|\s] is matched as a literal pipe.
        # The gender marker class is [mf]; the former [m|f] also accepted '|'.
        if re.search(r'[\w|\s]+ [mf]\.', element, re.U):
            tags.append('NN')
        if '[VERB]' in element:
            tags.append('VB')
        if 'adj.' in element and re.search(r'([\w|\s]+, [\w|\s]+)', element, re.U):
            tags.append('JJ')
    else:
        for candidate in self.elements:
            if candidate.startswith(self.word):
                tags += self.pos(candidate)
    return list(set(tags))


@DictScraper._needs_elements
def gender(self):
    ''' Tries to scrape the gender for a given noun from leo.org.

    Returns 'm', 'f' or 'n', or None if no gender marker is found.
    '''

    element = self._first('NN')
    if element:
        # The former class [m|f|n)] also matched '|' and ')', so a
        # non-gender character could be returned.
        marker = re.search(r'([mfn])\.', element, re.U)
        if marker:
            return marker.group(1)
    return None


register(LeoIt)
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixDe(Verbix):
    ''' Verbix scraper bound to the language code 'de'. '''

    def __init__(self, word):
        ''' Sets up the scraper for word and maps both future tenses to themselves. '''
        super(VerbixDe, self).__init__(word, 'de')
        for tense in ('Future I', 'Future II'):
            self.tenses[tense] = tense

    def _normalize(self, string):
        ''' Returns a sanitized string. '''
        cleaned = super(VerbixDe, self)._normalize(string)
        # Collapse the combined pronoun label before trimming whitespace
        return cleaned.replace('sie; Sie', 'sie').strip()


register(VerbixDe)
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixNl(Verbix):
    ''' Verbix scraper bound to the language code 'nl'. '''

    def __init__(self, word):
        ''' Sets up the scraper for word and overrides three tense labels. '''
        super(VerbixNl, self).__init__(word, 'nl')
        overrides = (
            ('Perfect', 'Present Perfect'),
            ('Pluperfect', 'Past Perfect'),
            ('Future II', 'Future Perfect'),
        )
        for tense, label in overrides:
            self.tenses[tense] = label


register(VerbixNl)
@DictScraper._needs_elements
def pos(self, element=None):
    ''' Tries to decide about the part of speech.

    With an element, returns the tags ('NN', 'VB', 'JJ') detected in it.
    Without one, collects the tags of every entry of self.elements that
    starts with self.word. The result is a duplicate-free list in no
    particular order.
    '''

    tags = []
    if element:
        # NOTE(review): the '|' inside [\w|\s] is matched as a literal pipe.
        # The gender marker class is [mf]; the former [m|f] also accepted '|'.
        if re.search(r'[\w|\s]+ [mf]\.', element, re.U):
            tags.append('NN')
        if '[VERB]' in element:
            tags.append('VB')
        if 'adj.' in element and re.search(r'([\w|\s]+, [\w|\s]+)', element, re.U):
            tags.append('JJ')
    else:
        for candidate in self.elements:
            if candidate.startswith(self.word):
                tags += self.pos(candidate)
    return list(set(tags))


@DictScraper._needs_elements
def gender(self):
    ''' Tries to scrape the gender for a given noun from leo.org.

    Returns 'm', 'f' or 'n', or None if no gender marker is found.
    '''

    element = self._first('NN')
    if element:
        # The former class [m|f|n)] also matched '|' and ')', so a
        # non-gender character could be returned.
        marker = re.search(r'([mfn])\.', element, re.U)
        if marker:
            return marker.group(1)
    return None


register(LeoIt)
@DictScraper._needs_elements
def plural(self):
    ''' Tries to scrape the plural version from uitmuntend.nl.

    Returns [plural], [''] if the entry lists no plural, or [None] if
    self._first('NN') yields nothing.
    '''

    element = self._first('NN')
    if not element:
        return [None]
    # Only the part before the first line break is inspected
    headline = element.split('\r\n')[0]
    if ' | ' not in headline:
        # This means there is no plural
        return ['']
    # This means there is a plural: the headline is split at ' | ' and the
    # second space-separated token of the right-hand part is returned.
    _singular, plural_part = headline.split(' | ')
    return [plural_part.split(' ')[1]]


@DictScraper._needs_elements
def gender(self):
    ''' Tries to scrape the gender for a given noun from uitmuntend.nl.

    Returns 'm' or 'f' when the headline carries a " [m]" / " [f]" marker
    and 'n' when it does not.
    '''

    element = self._first('NN')
    if element:
        headline = element.split('\r\n')[0]
        # The former class [m|f] also matched a literal '|'
        marker = re.search(r' \[([mf])\]', headline, re.U)
        if marker:
            return marker.group(1)
        # NOTE(review): assumes the 'n' fallback applies only when a noun
        # element was found -- confirm against the original indentation.
        return 'n'
    return None


register(UitmuntendNl)
return result @DictScraper._needs_elements def plural(self): ''' Tries to scrape the plural version from vandale.nl. ''' element = self._first('NN') if element: if re.search('meervoud: ([\w|\s|\'|\-|,]+)', element, re.U): results = re.search('meervoud: ([\w|\s|\'|\-|,]+)', element, re.U).groups()[0].split(', ') results = [x.replace('ook ', '').strip() for x in results] return results else: # There is no plural form return [''] return [None] @DictScraper._needs_elements def miniaturize(self): ''' Tries to scrape the miniaturized version from vandale.nl. ''' element = self._first('NN') if element: if re.search('verkleinwoord: (\w+)', element, re.U): return re.findall('verkleinwoord: (\w+)', element, re.U) else: return [''] return [None] register(VandaleNl)
genus = re.findall(' ([m|f]) ', content)[0] return genus
# NOTE(review): the two methods below are disabled. They test for the
# articles 'der ', 'die ', 'das ' and for a paragraph titled
# "Trennungsmöglichkeiten am Zeilenumbruch", although this module registers
# WiktionaryIt -- confirm the selectors before re-enabling them.
#
# @TextScraper._needs_download
# def articles(self):
#     result = [None, None]
#     if self.pos() == 'NN':
#         if self.tree.xpath('//table[contains(@class, "wikitable")]/tr'):
#             content = self.tree.xpath('//table[contains(@class, "wikitable")]/tr')[1].text_content()
#             singular, plural = content.split('\n')[1:3]
#             if singular.startswith(('der ', 'die ', 'das ')):
#                 result[0] = singular.split(' ')[0]
#             if plural.startswith(('der ', 'die ', 'das ')):
#                 result[1] = plural.split(' ')[0]
#     return result
#
# @TextScraper._needs_download
# def plural(self):
#     if self.pos() == 'NN':
#         if self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]'):
#             content = self._normalize(self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]')[0].getnext().text_content())
#             result = re.findall('Plural[\d|\s]*: ([\w|\s]+)', content, re.U)
#             result = [x.strip() for x in result]
#             return result
#     return [None]

register(WiktionaryIt)
import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixFr(Verbix):
    ''' Verbix scraper bound to the language code 'fr'. '''

    def __init__(self, word):
        ''' Sets up the scraper for word and installs the French tense labels. '''
        super(VerbixFr, self).__init__(word, 'fr')
        labels = (
            ('Present', 'Présent'),
            ('Past', 'Imparfait'),
            ('Perfect', 'Passé composé'),
            ('Pluperfect', 'Plus-que-parfait'),
            ('Future I', 'Futur simple'),
            ('Future II', 'Futur antérieur'),
        )
        for tense, label in labels:
            self.tenses[tense] = label

    def _normalize(self, string):
        ''' Returns a sanitized string. '''
        cleaned = super(VerbixFr, self)._normalize(string)
        # Rewrite the combined pronoun labels, singular first, then plural
        cleaned = cleaned.replace('il; elle', 'il/elle')
        cleaned = cleaned.replace('ils; elles', 'ils/elles')
        return cleaned.strip()


register(VerbixFr)
@DictScraper._needs_elements
def pos(self, element = None):
    ''' Tries to decide about the part of speech.

    With an element, returns the tags ('NN', 'VB', 'JJ') detected in it.
    Without one, returns the tags of the first entry of self.elements
    that contains self.word, or an empty list if there is none.
    '''

    tags = []
    if element:
        # Gender marker such as "{m}"; the former class [m|f] also
        # accepted a literal '|'.
        if re.search(r'\w+ {[mf]}', element, re.U):
            tags.append('NN')
        # Bracketed group of word characters and pipes
        if re.search(r'\w+ \[[\w|\|]+\]', element, re.U):
            tags.append('VB')
        if '{agg.}' in element:
            tags.append('JJ')
    else:
        for candidate in self.elements:
            if self.word in unicode(candidate):
                return self.pos(candidate)
    return tags


@DictScraper._needs_elements
def gender(self):
    ''' Tries to scrape the gender for a given noun from babl.la.

    Returns 'm', 'f' or 'n', or None if no "{...}" gender marker is found.
    '''

    element = self._first('NN')
    if element:
        # The former class [m|f|n)] also matched '|' and ')', so a
        # non-gender character could be returned.
        marker = re.search(r'{([mfn])}', element, re.U)
        if marker:
            return marker.group(1)
    return None


register(BablaIt)
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixEn(Verbix):
    ''' Verbix scraper bound to the language code 'en'. '''

    def __init__(self, word):
        ''' Sets up the scraper for word, passing 'en' to the base class. '''
        language = 'en'
        super(VerbixEn, self).__init__(word, language)


register(VerbixEn)
if 'kein Plur' in element: # There is no plural result[1] = [''] else: # If a plural form exists, there is only one possibility result[1] = ['die'] return result @DictScraper._needs_elements def plural(self): ''' Tries to scrape the plural version from pons.eu. ''' element = self._first('NN') if element: if 'kein Plur' in element: # There is no plural return [''] if re.search(', ([\w|\s|/]+)>', element, re.U): # Plural form is provided return re.findall(', ([\w|\s|/]+)>', element, re.U)[0].split('/') if re.search(', -(\w+)>', element, re.U): # Suffix is provided suffix = re.findall(', -(\w+)>', element, re.U)[0] return [self.word + suffix] if element.endswith('->'): # Plural is the same as singular return [self.word] return [None] register(PonsDe)
# Remove duplicates result = list(set(result)) return result return [None]


@TextScraper._needs_download
def superlative(self):
    ''' Tries to scrape the superlative forms of an adjective.

    Returns a list of forms, each starting with "am ", or [None] if the
    word is not tagged 'JJ' or the anchor paragraph is missing.
    '''

    if 'JJ' in self.pos():
        anchors = self.tree.xpath(
            u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]'
        )
        if anchors:
            # The forms are read from the sibling following the anchor paragraph
            content = self._normalize(anchors[0].getnext().text_content())
            forms = re.findall(r'Superlativ[\d|\s]*: ([\w|\s]+)', content, re.U)
            # Remove duplicates
            forms = list(set(form.strip() for form in forms))
            # Prepend "am " if necessary; a list is built explicitly so the
            # result is a list on every Python version.
            return [form if form.startswith('am ') else 'am ' + form
                    for form in forms]
    return [None]


register(WiktionaryDe)
@DictScraper._needs_elements
def pos(self, element=None):
    ''' Tries to decide about the part of speech.

    With an element, returns the tags ('NN', 'VB', 'JJ') detected in it.
    Without one, returns the tags of the first entry of self.elements
    that contains self.word, or an empty list if there is none.
    '''

    tags = []
    if element:
        # Gender marker such as "{m}"; the former class [m|f] also
        # accepted a literal '|'.
        if re.search(r'\w+ {[mf]}', element, re.U):
            tags.append('NN')
        # Bracketed group of word characters and pipes
        if re.search(r'\w+ \[[\w|\|]+\]', element, re.U):
            tags.append('VB')
        if '{agg.}' in element:
            tags.append('JJ')
    else:
        for candidate in self.elements:
            if self.word in unicode(candidate):
                return self.pos(candidate)
    return tags


@DictScraper._needs_elements
def gender(self):
    ''' Tries to scrape the gender for a given noun from babl.la.

    Returns 'm', 'f' or 'n', or None if no "{...}" gender marker is found.
    '''

    element = self._first('NN')
    if element:
        # The former class [m|f|n)] also matched '|' and ')', so a
        # non-gender character could be returned.
        marker = re.search(r'{([mfn])}', element, re.U)
        if marker:
            return marker.group(1)
    return None


register(BablaIt)
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixEs(Verbix):
    ''' Verbix scraper bound to the language code 'es'. '''

    def __init__(self, word):
        ''' Sets up the scraper for word and installs the Spanish tense labels. '''
        super(VerbixEs, self).__init__(word, 'es')
        labels = (
            ('Present', 'Presente'),
            ('Past', 'Pretérito imperfecto'),
            ('Perfect', 'Pretérito perfecto compuesto'),
            ('Pluperfect', 'Pretérito pluscuamperfecto'),
            ('Future I', 'Futuro'),
            ('Future II', 'Futuro perfecto'),
        )
        for tense, label in labels:
            self.tenses[tense] = label


register(VerbixEs)
if re.search(' ([m|f]) ', content): genus = re.findall(' ([m|f]) ', content)[0] return genus
# NOTE(review): the two methods below are disabled. They test for the
# articles 'der ', 'die ', 'das ' and for a paragraph titled
# "Trennungsmöglichkeiten am Zeilenumbruch", although this module registers
# WiktionaryIt -- confirm the selectors before re-enabling them.
#
# @TextScraper._needs_download
# def articles(self):
#     result = [None, None]
#     if self.pos() == 'NN':
#         if self.tree.xpath('//table[contains(@class, "wikitable")]/tr'):
#             content = self.tree.xpath('//table[contains(@class, "wikitable")]/tr')[1].text_content()
#             singular, plural = content.split('\n')[1:3]
#             if singular.startswith(('der ', 'die ', 'das ')):
#                 result[0] = singular.split(' ')[0]
#             if plural.startswith(('der ', 'die ', 'das ')):
#                 result[1] = plural.split(' ')[0]
#     return result
#
# @TextScraper._needs_download
# def plural(self):
#     if self.pos() == 'NN':
#         if self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]'):
#             content = self._normalize(self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]')[0].getnext().text_content())
#             result = re.findall('Plural[\d|\s]*: ([\w|\s]+)', content, re.U)
#             result = [x.strip() for x in result]
#             return result
#     return [None]

register(WiktionaryIt)
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixIt(Verbix):
    ''' Verbix scraper bound to the language code 'it'. '''

    def __init__(self, word):
        ''' Sets up the scraper for word and installs the Italian tense labels. '''
        super(VerbixIt, self).__init__(word, 'it')
        labels = (
            ('Present', 'Presente'),
            ('Past', 'Imperfetto'),
            ('Perfect', 'Passato prossimo'),
            ('Pluperfect', 'Trapassato prossimo'),
            ('Future I', 'Futuro'),
            ('Future II', 'Futuro anteriore'),
        )
        for tense, label in labels:
            self.tenses[tense] = label


register(VerbixIt)
# There is a plural form result[1] = 'de' else: # There is no plural form result[1] = '' return result


@TextScraper._needs_download
def plural(self):
    ''' Tries to scrape the plural form of a noun from the downloaded page.

    Returns a one-element list: the plural form, [''] if the page lists no
    plural for the noun, or [None] if the word is not tagged as a noun.
    '''

    if 'NN' not in self.pos():
        return [None]

    heading = self.tree.xpath(
        '//div[@class="grad733100"]/h2[@class="inline"]')[0].text_content()
    # Strip surrounding 'I' and space characters, then drop the first token
    # (the article in front of the noun) and glue the remaining tokens together.
    heading = heading.strip('I ')
    word = ''.join(heading.split(' ')[1:])

    info = ''
    tables = self.tree.xpath('//div[@class="grad733100"]/table')
    if tables:
        info = tables[0].text_content().encode('latin-1')

    # The dot of "(meerv.)" is escaped so that it only matches a literal dot.
    suffix = re.search(r'-(\w+) \(meerv\.\)', info, re.U)
    if suffix:
        # Suffix is provided
        return [word + suffix.group(1).strip()]
    form = re.search(r'([\w|\s]+) \(meerv\.\)', info, re.U)
    if form:
        # Plural form is provided
        return [form.group(1).strip()]
    # There is no plural
    return ['']


register(MijnWoordenBoekNl)
@TextScraper._needs_download
def comparative(self):
    ''' Tries to scrape the comparative forms of an adjective.

    Returns a duplicate-free list of forms, or [None] if the word is not
    tagged 'JJ' or the anchor paragraph is missing.
    '''

    if 'JJ' in self.pos():
        forms = self._degree_forms('Komparativ')
        if forms is not None:
            return forms
    return [None]


@TextScraper._needs_download
def superlative(self):
    ''' Tries to scrape the superlative forms of an adjective.

    Returns a list of forms, each starting with "am ", or [None] if the
    word is not tagged 'JJ' or the anchor paragraph is missing.
    '''

    if 'JJ' in self.pos():
        forms = self._degree_forms('Superlativ')
        if forms is not None:
            # Prepend "am " if necessary; a list is built explicitly so the
            # result is a list on every Python version.
            return [form if form.startswith('am ') else 'am ' + form
                    for form in forms]
    return [None]


def _degree_forms(self, label):
    ''' Returns the stripped, duplicate-free forms listed after label.

    Returns None if the anchor paragraph is not present in the page.
    '''

    anchors = self.tree.xpath(
        u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]'
    )
    if not anchors:
        return None
    # The forms are read from the sibling following the anchor paragraph
    content = self._normalize(anchors[0].getnext().text_content())
    found = re.findall(label + r'[\d|\s]*: ([\w|\s]+)', content, re.U)
    # Remove duplicates
    return list(set(form.strip() for form in found))


register(WiktionaryDe)