Exemple #1
0
    @TextScraper._needs_download
    def plural(self):
        ''' Tries to scrape the plural form of a noun from the parsed page.

        Returns a one-element list: the plural form, '' when the page
        lists no plural, or None when the word is not tagged 'NN'.
        '''

        if 'NN' not in self.pos():
            return [None]

        heading = self.tree.xpath(
            '//div[@class="grad733100"]/h2[@class="inline"]'
        )[0].text_content().strip('I ')
        # Everything after the first token (the article) is the word itself
        word = ''.join(heading.split(' ')[1:])

        tables = self.tree.xpath('//div[@class="grad733100"]/table')
        info = ''
        if tables:
            info = tables[0].text_content().encode('latin-1')

        # Suffix is provided
        by_suffix = re.search('-(\w+) \(meerv.\)', info, re.U)
        if by_suffix:
            return [word + by_suffix.group(1).strip()]

        # Plural form is provided
        by_form = re.search('([\w|\s]+) \(meerv.\)', info, re.U)
        if by_form:
            return [by_form.group(1).strip()]

        # There is no plural
        return ['']


register(MijnWoordenBoekNl)
Exemple #2
0
    def pos(self, element=None):
        ''' Tries to decide about the part of speech.

        With an element string, returns the tags detected in it:
        'NN' (a word followed by " m." or " f."), 'VB' (contains "[VERB]")
        and 'JJ' (contains "adj." and a comma-separated pair of words).
        Without one, collects the tags of every entry in self.elements
        that starts with self.word. Each tag is listed once, in the order
        it was first found.
        '''

        tags = []
        if element:
            # Plain character classes: inside [...] a '|' is a literal
            # character, so '[m|f]' used to accept "x |." as a noun.
            if re.search(r'[\w\s]+ [mf]\.', element, re.U):
                tags.append('NN')
            if '[VERB]' in element:
                tags.append('VB')
            if 'adj.' in element and re.search(r'[\w\s]+, [\w\s]+', element,
                                               re.U):
                tags.append('JJ')
        else:
            for candidate in self.elements:
                if candidate.startswith(self.word):
                    tags += self.pos(candidate)
        # Deduplicate while keeping a deterministic, first-seen order
        unique = []
        for tag in tags:
            if tag not in unique:
                unique.append(tag)
        return unique

    @DictScraper._needs_elements
    def gender(self):
        ''' Tries to scrape the gender for a given noun from leo.org.

        Looks at the element returned by self._first('NN') and returns the
        first 'm', 'f' or 'n' that is directly followed by a period.
        Returns None when there is no such element or no marker in it.
        '''

        element = self._first('NN')
        if not element:
            return None
        # Plain character class: the former '[m|f|n)]' also accepted the
        # literal characters '|' and ')' as a "gender".
        match = re.search(r'([mfn])\.', element, re.U)
        return match.group(1) if match else None


register(LeoIt)
Exemple #3
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixDe(Verbix):
    ''' Verbix scraper preset with the language code 'de'. '''

    def __init__(self, word):
        ''' Initializes the parent for 'de' and sets the future tense labels. '''

        super(VerbixDe, self).__init__(word, 'de')
        # Both future tenses are labelled with their own key
        for tense in ('Future I', 'Future II'):
            self.tenses[tense] = tense

    def _normalize(self, string):
        ''' Returns the parent's sanitized string with "sie; Sie" collapsed. '''

        cleaned = super(VerbixDe, self)._normalize(string)
        return cleaned.replace('sie; Sie', 'sie').strip()


register(VerbixDe)
Exemple #4
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register

class VerbixNl(Verbix):
	''' Verbix scraper preset with the language code 'nl'. '''

	def __init__(self, word):
		''' Initializes the parent for 'nl' and relabels three tenses. '''

		super(VerbixNl, self).__init__(word, 'nl')
		labels = (
			('Perfect', 'Present Perfect'),
			('Pluperfect', 'Past Perfect'),
			('Future II', 'Future Perfect'),
		)
		for tense, label in labels:
			self.tenses[tense] = label

register(VerbixNl)
Exemple #5
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixNl(Verbix):
    ''' Verbix scraper preset with the language code 'nl'. '''

    def __init__(self, word):
        ''' Initializes the parent for 'nl' and relabels three tenses. '''

        super(VerbixNl, self).__init__(word, 'nl')
        labels = (
            ('Perfect', 'Present Perfect'),
            ('Pluperfect', 'Past Perfect'),
            ('Future II', 'Future Perfect'),
        )
        for tense, label in labels:
            self.tenses[tense] = label


register(VerbixNl)
Exemple #6
0
Fichier : leo.py Projet : lltk/lltk
	@DictScraper._needs_elements
	def pos(self, element = None):
		''' Tries to decide about the part of speech.

		With an element string, returns the tags detected in it:
		'NN' (a word followed by " m." or " f."), 'VB' (contains "[VERB]")
		and 'JJ' (contains "adj." and a comma-separated pair of words).
		Without one, collects the tags of every entry in self.elements
		that starts with self.word. Each tag is listed once, in the order
		it was first found.
		'''

		tags = []
		if element:
			# Plain character classes: inside [...] a '|' is a literal
			# character, so '[m|f]' used to accept "x |." as a noun.
			if re.search(r'[\w\s]+ [mf]\.', element, re.U):
				tags.append('NN')
			if '[VERB]' in element:
				tags.append('VB')
			if 'adj.' in element and re.search(r'[\w\s]+, [\w\s]+', element, re.U):
				tags.append('JJ')
		else:
			for candidate in self.elements:
				if candidate.startswith(self.word):
					tags += self.pos(candidate)
		# Deduplicate while keeping a deterministic, first-seen order
		unique = []
		for tag in tags:
			if tag not in unique:
				unique.append(tag)
		return unique

	@DictScraper._needs_elements
	def gender(self):
		''' Tries to scrape the gender for a given noun from leo.org.

		Looks at the element returned by self._first('NN') and returns the
		first 'm', 'f' or 'n' that is directly followed by a period.
		Returns None when there is no such element or no marker in it.
		'''

		element = self._first('NN')
		if not element:
			return None
		# Plain character class: the former '[m|f|n)]' also accepted the
		# literal characters '|' and ')' as a "gender".
		match = re.search(r'([mfn])\.', element, re.U)
		return match.group(1) if match else None

register(LeoIt)
Exemple #7
0
	@DictScraper._needs_elements
	def plural(self):
		''' Tries to scrape the plural version from uitmuntend.nl.

		Only the first line of the element returned by self._first('NN')
		is inspected. Returns a one-element list: the plural word, ''
		when the line has no " | " separator, or None when there is no
		element at all.
		'''

		element = self._first('NN')
		if not element:
			return [None]
		headline = element.split('\r\n')[0]
		if ' | ' not in headline:
			# This means there is no plural
			return ['']
		# The segment right after the first separator holds the plural;
		# further separators no longer break the unpacking.
		tokens = headline.split(' | ')[1].split(' ')
		# Usually "<article> <plural>"; fall back to the only token when
		# no article precedes the plural instead of raising IndexError.
		return [tokens[1] if len(tokens) > 1 else tokens[0]]

	@DictScraper._needs_elements
	def gender(self):
		''' Tries to scrape the gender for a given noun from uitmuntend.nl.

		Only the first line of the element returned by self._first('NN')
		is inspected. Returns 'm' or 'f' when the line carries a " [m]" or
		" [f]" marker, 'n' when it carries none, and None when there is
		no element at all.
		'''

		element = self._first('NN')
		if not element:
			return None
		headline = element.split('\r\n')[0]
		# Plain character class: '[m|f]' also accepted a literal '|'
		match = re.search(r' \[([mf])\]', headline, re.U)
		return match.group(1) if match else 'n'

register(UitmuntendNl)
Exemple #8
0
		return result

	@DictScraper._needs_elements
	def plural(self):
		''' Tries to scrape the plural version from vandale.nl.

		Returns the list of forms found after "meervoud: " (split on
		", ", with "ook " removed), [''] when the element has no such
		section, or [None] when self._first('NN') yields nothing.
		'''

		entry = self._first('NN')
		if not entry:
			return [None]
		found = re.search('meervoud: ([\w|\s|\'|\-|,]+)', entry, re.U)
		if found is None:
			# There is no plural form
			return ['']
		forms = found.group(1).split(', ')
		return [form.replace('ook ', '').strip() for form in forms]

	@DictScraper._needs_elements
	def miniaturize(self):
		''' Tries to scrape the miniaturized version from vandale.nl.

		Returns every word found after "verkleinwoord: ", [''] when the
		element has none, or [None] when self._first('NN') yields nothing.
		'''

		entry = self._first('NN')
		if not entry:
			return [None]
		diminutives = re.findall('verkleinwoord: (\w+)', entry, re.U)
		if diminutives:
			return diminutives
		return ['']

register(VandaleNl)
Exemple #9
0
                genus = re.findall(' ([m|f]) ', content)[0]
                return genus


#	@TextScraper._needs_download
#	def articles(self):

#		result = [None, None]
#		if self.pos() == 'NN':
#			if self.tree.xpath('//table[contains(@class, "wikitable")]/tr'):
#				content = self.tree.xpath('//table[contains(@class, "wikitable")]/tr')[1].text_content()
#				singular, plural = content.split('\n')[1:3]
#				if singular.startswith(('der ', 'die ', 'das ')):
#					result[0] = singular.split(' ')[0]
#				if plural.startswith(('der ', 'die ', 'das ')):
#					result[1] = plural.split(' ')[0]
#		return result

#	@TextScraper._needs_download
#	def plural(self):

#		if self.pos() == 'NN':
#			if self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]'):
#				content = self._normalize(self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]')[0].getnext().text_content())
#				result = re.findall('Plural[\d|\s]*: ([\w|\s]+)', content, re.U)
#				result = [x.strip() for x in result]
#				return result
#		return [None]

register(WiktionaryIt)
Exemple #10
0
import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register

class VerbixFr(Verbix):
	''' Verbix scraper preset with the language code 'fr'. '''

	def __init__(self, word):
		''' Initializes the parent for 'fr' and relabels six tenses. '''

		super(VerbixFr, self).__init__(word, 'fr')
		labels = (
			('Present', 'Présent'),
			('Past', 'Imparfait'),
			('Perfect', 'Passé composé'),
			('Pluperfect', 'Plus-que-parfait'),
			('Future I', 'Futur simple'),
			('Future II', 'Futur antérieur'),
		)
		for tense, label in labels:
			self.tenses[tense] = label

	def _normalize(self, string):
		''' Returns the parent's sanitized string with pronoun pairs joined by "/". '''

		cleaned = super(VerbixFr, self)._normalize(string)
		for old, new in (('il; elle', 'il/elle'), ('ils; elles', 'ils/elles')):
			cleaned = cleaned.replace(old, new)
		return cleaned.strip()

register(VerbixFr)
Exemple #11
0
	@DictScraper._needs_elements
	def pos(self, element = None):
		''' Tries to decide about the part of speech.

		With an element string, returns the tags detected in it:
		'NN' (a word followed by " {m}" or " {f}"), 'VB' (a word followed
		by a bracketed list such as " [a|b]") and 'JJ' (contains "{agg.}").
		Without one, returns the tags of the first entry in self.elements
		that contains self.word, or [] when no entry does.
		'''

		tags = []
		if element:
			# Plain character class: '[m|f]' also accepted a literal '|',
			# so "x {|}" used to be tagged as a noun.
			if re.search(r'\w+ \{[mf]\}', element, re.U):
				tags.append('NN')
			# Here the '|' is wanted: it separates the bracketed forms
			if re.search(r'\w+ \[[\w|]+\]', element, re.U):
				tags.append('VB')
			if '{agg.}' in element:
				tags.append('JJ')
		else:
			for candidate in self.elements:
				if self.word in unicode(candidate):
					return self.pos(candidate)
		return tags

	@DictScraper._needs_elements
	def gender(self):
		''' Tries to scrape the gender for a given noun from babl.la.

		Looks at the element returned by self._first('NN') and returns the
		letter of the first "{m}", "{f}" or "{n}" marker in it. Returns
		None when there is no such element or no marker in it.
		'''

		element = self._first('NN')
		if not element:
			return None
		# Plain character class: the former '[m|f|n)]' also accepted the
		# literal characters '|' and ')' as a "gender".
		match = re.search(r'\{([mfn])\}', element, re.U)
		return match.group(1) if match else None

register(BablaIt)
Exemple #12
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register

class VerbixEn(Verbix):
	''' Verbix scraper preset with the language code 'en'. '''

	def __init__(self, word):
		''' Initializes the parent with the given word and the code 'en'. '''

		super(VerbixEn, self).__init__(word, 'en')

register(VerbixEn)
Exemple #13
0
			if 'kein Plur' in element:
				# There is no plural
				result[1] = ['']
			else:
				# If a plural form exists, there is only one possibility
				result[1] = ['die']
		return result

	@DictScraper._needs_elements
	def plural(self):
		''' Tries to scrape the plural version from pons.eu.

		Returns [''] when the element says "kein Plur", the listed plural
		form(s) split on "/", self.word plus a listed suffix, [self.word]
		when the element ends with "->", and [None] otherwise.
		'''

		entry = self._first('NN')
		if entry:
			if 'kein Plur' in entry:
				# There is no plural
				return ['']
			# Plural form is provided
			full_form = re.search(', ([\w|\s|/]+)>', entry, re.U)
			if full_form:
				return full_form.group(1).split('/')
			# Suffix is provided
			ending = re.search(', -(\w+)>', entry, re.U)
			if ending:
				return [self.word + ending.group(1)]
			# Plural is the same as singular
			if entry.endswith('->'):
				return [self.word]
		return [None]

register(PonsDe)
Exemple #14
0
    @DictScraper._needs_elements
    def plural(self):
        ''' Tries to scrape the plural version from uitmuntend.nl.

        Only the first line of the element returned by self._first('NN')
        is inspected. Returns a one-element list: the plural word, ''
        when the line has no " | " separator, or None when there is no
        element at all.
        '''

        element = self._first('NN')
        if not element:
            return [None]
        headline = element.split('\r\n')[0]
        if ' | ' not in headline:
            # This means there is no plural
            return ['']
        # The segment right after the first separator holds the plural;
        # further separators no longer break the unpacking.
        tokens = headline.split(' | ')[1].split(' ')
        # Usually "<article> <plural>"; fall back to the only token when
        # no article precedes the plural instead of raising IndexError.
        return [tokens[1] if len(tokens) > 1 else tokens[0]]

    @DictScraper._needs_elements
    def gender(self):
        ''' Tries to scrape the gender for a given noun from uitmuntend.nl.

        Only the first line of the element returned by self._first('NN')
        is inspected. Returns 'm' or 'f' when the line carries a " [m]" or
        " [f]" marker, 'n' when it carries none, and None when there is
        no element at all.
        '''

        element = self._first('NN')
        if not element:
            return None
        headline = element.split('\r\n')[0]
        # Plain character class: '[m|f]' also accepted a literal '|'
        match = re.search(r' \[([mf])\]', headline, re.U)
        return match.group(1) if match else 'n'


register(UitmuntendNl)
Exemple #15
0
                # Remove duplicates
                result = list(set(result))
                return result
        return [None]

    @TextScraper._needs_download
    def superlative(self):
        ''' Tries to scrape the superlative forms of an adjective.

        Returns a list of the distinct forms found after "Superlativ:",
        each starting with "am " and in order of first appearance.
        Returns [None] when the word is not tagged 'JJ' or the expected
        paragraph (or its following sibling) is missing.
        '''

        if 'JJ' not in self.pos():
            return [None]
        anchors = self.tree.xpath(
            u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]'
        )
        if not anchors:
            return [None]
        sibling = anchors[0].getnext()
        if sibling is None:
            # The anchor paragraph has no following element to parse
            return [None]
        content = self._normalize(sibling.text_content())
        result = []
        for form in re.findall(r'Superlativ[\d|\s]*: ([\w|\s]+)', content,
                               re.U):
            form = form.strip()
            # Prepend "am " if necessary
            if not form.startswith('am '):
                form = 'am ' + form
            # Deduplicate only after prepending, so "x" and "am x" collapse;
            # a real list is returned on Python 2 and 3 alike.
            if form not in result:
                result.append(form)
        return result


register(WiktionaryDe)
Exemple #16
0
    @DictScraper._needs_elements
    def pos(self, element=None):
        ''' Tries to decide about the part of speech.

        With an element string, returns the tags detected in it:
        'NN' (a word followed by " {m}" or " {f}"), 'VB' (a word followed
        by a bracketed list such as " [a|b]") and 'JJ' (contains "{agg.}").
        Without one, returns the tags of the first entry in self.elements
        that contains self.word, or [] when no entry does.
        '''

        tags = []
        if element:
            # Plain character class: '[m|f]' also accepted a literal '|',
            # so "x {|}" used to be tagged as a noun.
            if re.search(r'\w+ \{[mf]\}', element, re.U):
                tags.append('NN')
            # Here the '|' is wanted: it separates the bracketed forms
            if re.search(r'\w+ \[[\w|]+\]', element, re.U):
                tags.append('VB')
            if '{agg.}' in element:
                tags.append('JJ')
        else:
            for candidate in self.elements:
                if self.word in unicode(candidate):
                    return self.pos(candidate)
        return tags

    @DictScraper._needs_elements
    def gender(self):
        ''' Tries to scrape the gender for a given noun from babl.la.

        Looks at the element returned by self._first('NN') and returns the
        letter of the first "{m}", "{f}" or "{n}" marker in it. Returns
        None when there is no such element or no marker in it.
        '''

        element = self._first('NN')
        if not element:
            return None
        # Plain character class: the former '[m|f|n)]' also accepted the
        # literal characters '|' and ')' as a "gender".
        match = re.search(r'\{([mfn])\}', element, re.U)
        return match.group(1) if match else None


register(BablaIt)
Exemple #17
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixEs(Verbix):
    ''' Verbix scraper preset with the language code 'es'. '''

    def __init__(self, word):
        ''' Initializes the parent for 'es' and relabels six tenses. '''

        super(VerbixEs, self).__init__(word, 'es')
        labels = (
            ('Present', 'Presente'),
            ('Past', 'Pretérito imperfecto'),
            ('Perfect', 'Pretérito perfecto compuesto'),
            ('Pluperfect', 'Pretérito pluscuamperfecto'),
            ('Future I', 'Futuro'),
            ('Future II', 'Futuro perfecto'),
        )
        for tense, label in labels:
            self.tenses[tense] = label


register(VerbixEs)
Exemple #18
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register

class VerbixDe(Verbix):
	''' Verbix scraper preset with the language code 'de'. '''

	def __init__(self, word):
		''' Initializes the parent for 'de' and sets the future tense labels. '''

		super(VerbixDe, self).__init__(word, 'de')
		# Both future tenses are labelled with their own key
		for tense in ('Future I', 'Future II'):
			self.tenses[tense] = tense

	def _normalize(self, string):
		''' Returns the parent's sanitized string with "sie; Sie" collapsed. '''

		cleaned = super(VerbixDe, self)._normalize(string)
		return cleaned.replace('sie; Sie', 'sie').strip()

register(VerbixDe)
Exemple #19
0
			if re.search(' ([m|f]) ', content):
				genus = re.findall(' ([m|f]) ', content)[0]
				return genus

#	@TextScraper._needs_download
#	def articles(self):

#		result = [None, None]
#		if self.pos() == 'NN':
#			if self.tree.xpath('//table[contains(@class, "wikitable")]/tr'):
#				content = self.tree.xpath('//table[contains(@class, "wikitable")]/tr')[1].text_content()
#				singular, plural = content.split('\n')[1:3]
#				if singular.startswith(('der ', 'die ', 'das ')):
#					result[0] = singular.split(' ')[0]
#				if plural.startswith(('der ', 'die ', 'das ')):
#					result[1] = plural.split(' ')[0]
#		return result

#	@TextScraper._needs_download
#	def plural(self):

#		if self.pos() == 'NN':
#			if self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]'):
#				content = self._normalize(self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]')[0].getnext().text_content())
#				result = re.findall('Plural[\d|\s]*: ([\w|\s]+)', content, re.U)
#				result = [x.strip() for x in result]
#				return result
#		return [None]

register(WiktionaryIt)
Exemple #20
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register

class VerbixEs(Verbix):
	''' Verbix scraper preset with the language code 'es'. '''

	def __init__(self, word):
		''' Initializes the parent for 'es' and relabels six tenses. '''

		super(VerbixEs, self).__init__(word, 'es')
		labels = (
			('Present', 'Presente'),
			('Past', 'Pretérito imperfecto'),
			('Perfect', 'Pretérito perfecto compuesto'),
			('Pluperfect', 'Pretérito pluscuamperfecto'),
			('Future I', 'Futuro'),
			('Future II', 'Futuro perfecto'),
		)
		for tense, label in labels:
			self.tenses[tense] = label

register(VerbixEs)
Exemple #21
0
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixIt(Verbix):
    ''' Verbix scraper preset with the language code 'it'. '''

    def __init__(self, word):
        ''' Initializes the parent for 'it' and relabels six tenses. '''

        super(VerbixIt, self).__init__(word, 'it')
        labels = (
            ('Present', 'Presente'),
            ('Past', 'Imperfetto'),
            ('Perfect', 'Passato prossimo'),
            ('Pluperfect', 'Trapassato prossimo'),
            ('Future I', 'Futuro'),
            ('Future II', 'Futuro anteriore'),
        )
        for tense, label in labels:
            self.tenses[tense] = label


register(VerbixIt)
Exemple #22
0
				# There is a plural form
				result[1] = 'de'
			else:
				# There is no plural form
				result[1] = ''
		return result

	@TextScraper._needs_download
	def plural(self):
		''' Tries to scrape the plural form of a noun from the parsed page.

		Returns a one-element list: the plural form, '' when the page
		lists no plural, or None when the word is not tagged 'NN'.
		'''

		if 'NN' not in self.pos():
			return [None]

		heading = self.tree.xpath('//div[@class="grad733100"]/h2[@class="inline"]')[0].text_content().strip('I ')
		# Everything after the first token (the article) is the word itself
		word = ''.join(heading.split(' ')[1:])

		tables = self.tree.xpath('//div[@class="grad733100"]/table')
		info = ''
		if tables:
			info = tables[0].text_content().encode('latin-1')

		# Suffix is provided
		by_suffix = re.search('-(\w+) \(meerv.\)', info, re.U)
		if by_suffix:
			return [word + by_suffix.group(1).strip()]

		# Plural form is provided
		by_form = re.search('([\w|\s]+) \(meerv.\)', info, re.U)
		if by_form:
			return [by_form.group(1).strip()]

		# There is no plural
		return ['']

register(MijnWoordenBoekNl)
Exemple #23
0
	@TextScraper._needs_download
	def comparative(self):
		''' Tries to scrape the comparative forms of an adjective.

		Returns the distinct forms found after "Komparativ:", or [None]
		when the word is not tagged 'JJ' or the expected paragraph is
		missing.
		'''

		if 'JJ' not in self.pos():
			return [None]
		anchors = self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]')
		if not anchors:
			return [None]
		# The forms are listed in the element following the anchor paragraph
		content = self._normalize(anchors[0].getnext().text_content())
		forms = [form.strip() for form in re.findall('Komparativ[\d|\s]*: ([\w|\s]+)', content, re.U)]
		# Remove duplicates
		return list(set(forms))

	@TextScraper._needs_download
	def superlative(self):
		''' Tries to scrape the superlative forms of an adjective.

		Returns a list of the distinct forms found after "Superlativ:",
		each starting with "am " and in order of first appearance.
		Returns [None] when the word is not tagged 'JJ' or the expected
		paragraph (or its following sibling) is missing.
		'''

		if 'JJ' not in self.pos():
			return [None]
		anchors = self.tree.xpath(u'//div[@id="mw-content-text"]/p[@title="Trennungsmöglichkeiten am Zeilenumbruch"]')
		if not anchors:
			return [None]
		sibling = anchors[0].getnext()
		if sibling is None:
			# The anchor paragraph has no following element to parse
			return [None]
		content = self._normalize(sibling.text_content())
		result = []
		for form in re.findall(r'Superlativ[\d|\s]*: ([\w|\s]+)', content, re.U):
			form = form.strip()
			# Prepend "am " if necessary
			if not form.startswith('am '):
				form = 'am ' + form
			# Deduplicate only after prepending, so "x" and "am x" collapse;
			# a real list is returned on Python 2 and 3 alike.
			if form not in result:
				result.append(form)
		return result

register(WiktionaryDe)
Exemple #24
0
import requests
from lxml import html
import re

from lltk.scrapers import Verbix
from lltk.scraping import register


class VerbixFr(Verbix):
    ''' Verbix scraper preset with the language code 'fr'. '''

    def __init__(self, word):
        ''' Initializes the parent for 'fr' and relabels six tenses. '''

        super(VerbixFr, self).__init__(word, 'fr')
        labels = (
            ('Present', 'Présent'),
            ('Past', 'Imparfait'),
            ('Perfect', 'Passé composé'),
            ('Pluperfect', 'Plus-que-parfait'),
            ('Future I', 'Futur simple'),
            ('Future II', 'Futur antérieur'),
        )
        for tense, label in labels:
            self.tenses[tense] = label

    def _normalize(self, string):
        ''' Returns the parent's sanitized string with pronoun pairs joined by "/". '''

        cleaned = super(VerbixFr, self)._normalize(string)
        for old, new in (('il; elle', 'il/elle'), ('ils; elles', 'ils/elles')):
            cleaned = cleaned.replace(old, new)
        return cleaned.strip()


register(VerbixFr)