Example #1
from collections import defaultdict

import lemmagen
from lemmagen.lemmatizer import Lemmatizer


def arrangeLemmatizedData(input, lemmatization=False, reverse=False):
    dd = defaultdict(list)
    if lemmatization:
        # Build the lemmatizers once instead of once per input line.
        lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
        lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    # openio is expected to behave like io.open, i.e. accept an encoding argument.
    with openio(input, encoding='utf8') as f:
        for line in f:
            line = line.split()
            source, target, score = line[0], line[1], line[2]
            source = source.strip('`’“„,‘')
            target = target.strip('`’“„,‘')
            if lemmatization and not reverse:
                # English source, Slovene target
                source = lemmatizer_en.lemmatize(source)
                target = lemmatizer_sl.lemmatize(target)
            elif lemmatization and reverse:
                # Slovene source, English target
                source = lemmatizer_sl.lemmatize(source)
                target = lemmatizer_en.lemmatize(target)

            dd[source].append((target, score))

    for k, v in dd.items():
        # Sort translation candidates by descending score.
        v = sorted(v, key=lambda tup: float(tup[1]), reverse=True)
        new_v = []
        for word, p in v:
            # Drop pairs whose word lengths differ too much.
            if (len(k) < 4 and len(word) > 5) or (len(word) < 4 and len(k) > 5):
                continue
            # Drop low-scoring candidates.
            if float(p) < 0.05:
                continue
            new_v.append((word, p))
        dd[k] = new_v
    return dd
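A brief, hedged usage sketch for the function above. The file name lexicon.txt is hypothetical; its lines are assumed to hold whitespace-separated source, target and score fields, which is what the parsing loop expects.

translations = arrangeLemmatizedData('lexicon.txt', lemmatization=True)
for source, candidates in translations.items():
    # candidates are (target, score) pairs sorted by descending score
    print(source + ': ' + ', '.join(t for t, s in candidates[:3]))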
Example #2
# Note: this snippet targets Python 2 and an older NLTK (unicode, list-returning
# filter(), sliceable FreqDist.items(), dict.iteritems()).
class Summarizer():
    def __init__(self):
        this_dir = os.path.dirname(os.path.abspath(__file__))
        self.lemmatizer = Lemmatizer()
        dir = os.path.join(this_dir, "tokenizers/slovene.pickle")
        self.sent_detector = nltk.data.load("file://" + dir)

        self.stopwords = open(
            os.path.join(this_dir, "tokenizers/stopwords.txt"),
            "rb").read().splitlines()
        self.stopwords = filter(lambda w: not w.startswith("#"),
                                self.stopwords)
        # Convert to unicode
        self.stopwords = [word.decode("utf-8") for word in self.stopwords]

    def summarize(self,
                  article_text,
                  num_sentences=DEFAULT_SUMMARIZATION_NUMBER):

        # Get words from article
        words = word_tokenize(article_text)

        # Filter non-alphanumeric chars from words
        words = [filter(unicode.isalnum, word) for word in words]
        words = filter(lambda w: len(w) > 0, words)  # Remove empty words

        # Now lemmatize all words
        words = [
            self.lemmatizer.lemmatize(word).lower() for word in words
            if word.lower() not in self.stopwords
        ]
        word_frequencies = FreqDist(words)
        most_frequent = [word[0] for word in word_frequencies.items()[:100]]

        # Now get sentences
        sentences = self.sent_detector.tokenize(article_text)

        wordcountdict = defaultdict(int)

        for word in most_frequent:
            lem_word = self.lemmatizer.lemmatize(word).lower()
            for i in range(0, len(sentences)):
                if lem_word in sentences[i]:
                    wordcountdict[i] += 1

        sorted_wordcounts = sorted(wordcountdict.iteritems(),
                                   key=operator.itemgetter(1),
                                   reverse=True)[:num_sentences]
        return [sentences[num] for num, count in sorted_wordcounts]
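A hedged usage sketch for the Summarizer class above (Python 2, matching the snippet); the article text is an arbitrary placeholder, not part of the original example.

summarizer = Summarizer()
for sentence in summarizer.summarize(u"... article text in Slovene ...", num_sentences=3):
    print(sentence)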
Example #3
def index(request, phrase_list):
    print("INDEX with phrase list:", phrase_list)
    a = Lemmatizer()
    lemmatised_list = []

    for i, word in enumerate(phrase_list):
        # Skip the first element of phrase_list; lemmatize the rest.
        if i:
            lemmatised_list.append(a.lemmatize(word) + " ")

    return Response(json.dumps(lemmatised_list))
Example #4
class Preprocessing:
    def __init__(self):
        self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
        self.punc_regex = re.compile(r'^[^0-9a-zA-Z]+$')

    def preprocess(self, text, raw=False, keep_stop_words=False):
        # Tokenize
        tokens = word_tokenize(text)

        if not raw:
            # Lemmatize
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

            # Convert to lowercase
            tokens = [t.lower() for t in tokens]

        if not keep_stop_words:
            # Remove stopwords and punctuations
            tokens = [
                t for t in tokens
                if t not in stop_words_slovene and not self.punc_regex.match(t)
            ]

        return tokens
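A short, hedged usage sketch for the Preprocessing class above; the sample sentence is arbitrary Slovene text, and stop_words_slovene is assumed to be defined at module level, as the snippet implies.

pre = Preprocessing()
tokens = pre.preprocess(u'Psi tečejo po zelenem travniku.')
# tokens: lemmatized, lowercased words with Slovene stopwords and punctuation removed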
Example #5
    def test_lemmatize(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize(str("hodimo"))
        self.assertEqual(str("hoditi"), lemmatized)
Example #6
    def test_null(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize(None)
        self.assertEqual(None, lemmatized)
Example #7
    def test_punctuation(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize("!\"=`.,/:")
        self.assertEqual("!\"=`.,/:", lemmatized)
Example #8
    def test_emptystring(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize("")
        self.assertEqual("", lemmatized)
Example #9
    def test_utf8lemmatize(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize("čistijo")
        self.assertEqual("čistiti", lemmatized)
Example #10
class Lemmatization():
    def __init__(self):
        self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

    def lemmatize(self, token):
        return self.lemmatizer.lemmatize(token)
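A one-line usage sketch for the wrapper above; the expected lemma matches the test examples earlier on this page.

print(Lemmatization().lemmatize(u'hodimo'))  # expected lemma: 'hoditi'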
Example #11
import sys

sys.path.append("C:/Users/dis/Documents/JanJezersek/EkoSmart/pylemmagen")

from lemmagen.lemmatizer import Lemmatizer

a = Lemmatizer()

for i, word in enumerate(sys.argv):
    # Skip argv[0] (the script path) and lemmatize every word passed on the command line.
    if i:
        sys.stdout.write(a.lemmatize(word) + " ")
Example #12
# Note: this snippet targets Python 2 (print statements, dict.has_key) and relies on
# peewee models (Lecture, Course, LectureWord, CourseWord, CorpusWord, db) defined elsewhere.
class Tokenizer(object):
    def __init__(self, lemmatize=True):
        self.debug = False
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.estLemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ESTONIAN)
        self.lemmatize = lemmatize
        self.stopwords = self.get_stopwords()

    def get_stopwords(self):
        sw = StopWord()
        return set(sw.words)

    def lemstem(self, token):
        if self.lemmatize:
            return self.lemmatizer.lemmatize(token)
        else:
            return self.stemmer.stem(token)

    def extractTokens(self, text):
        try:
            tokens = word_tokenize(text)
        except UnicodeEncodeError:
            tokens = []

        if not tokens:
            return {}

        est_text = self.is_estonian(text)

        token_dict = {}
        for token in tokens:
            token = token.lower()

            # keep only purely alphabetic tokens longer than two characters
            if not (token.isalpha() and len(token) > 2):
                continue

            try:
                if est_text:
                    lemstem_word = self.estLemmatizer.lemmatize(token)
                else:
                    lemstem_word = self.lemstem(token)
            except Exception:
                lemstem_word = token

            if lemstem_word not in self.stopwords:
                if self.debug:
                    print "{0}: {1}".format(token.encode('utf-8'), lemstem_word.encode('utf-8'))
                if token_dict.has_key(lemstem_word):
                    token_dict[lemstem_word] += 1
                else:
                    token_dict[lemstem_word] = 1

        return token_dict

    def is_estonian(self, text):
        est = False
        try:
            est = detect(text) == 'et'
        except Exception:
            pass
        return est

    def getLectureRecord(self, lectureId):
        try:
            data = Lecture.select().where(Lecture.id == lectureId).get()
            return data
        except Exception:
            return None

    def extractLectureTokens(self, lecture):
        if lecture is None:
            return False

        text = lecture.content
        tokens = self.extractTokens(text)
        sorted_tokens = sorted(tokens.items(), key=operator.itemgetter(1))

        for token in sorted_tokens:
            try:
                with db.transaction() as txn:
                    LectureWord.create(
                        lecture=lecture,
                        word=token[0],
                        count=token[1],
                        active=True,
                        weight=0
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for lecture {0}, word {1}, {2}".format(lecture.id, token[0], e)

            if self.debug:
                print token

        return True

    def getCourseRecord(self, courseId):
        try:
            data = Course.select().where(Course.id == courseId).get()
            return data
        except Exception:
            return None

    def getLectures(self, course):
        lectures = Lecture.select().where(Lecture.course == course)
        return list(lectures)

    def extractCourseTokens(self, lectures):
        print "Lecture count: {0}".format(len(lectures))
        for lecture in lectures:
            print "Lecture: {0}".format(lecture.id)
            self.extractLectureTokens(lecture)

    def getCourses(self, courseId=0):
        if courseId:
            courses = Course.select().where(Course.id == courseId)
        else:
            courses = Course.select()
        return list(courses)

    def extractAllCourseTokens(self):
        for course in self.getCourses():
            print course.id, course.name
            lectures = self.getLectures(course)
            self.extractCourseTokens(lectures)

    def getLectureWords(self, lecture):
        lectureWords = list(LectureWord.select().where(LectureWord.lecture == lecture))
        return lectureWords

    def createCourseTokens(self):
        for course in self.getCourses():
            print "{}: {}".format(course.id, course.name.encode('utf8'))
            token_dict = {}
            lecture_token = {}

            for lecture in self.getLectures(course):
                lectureWords = self.getLectureWords(lecture)
                for lectureWord in lectureWords:
                    if not token_dict.has_key(lectureWord.word):
                        token_dict[lectureWord.word] = 0
                        lecture_token[lectureWord.word] = 0

                    token_dict[lectureWord.word] += lectureWord.count
                    lecture_token[lectureWord.word] += 1
            sorted_tokens = sorted(token_dict.items(), key=operator.itemgetter(1))
            for token in sorted_tokens:
                try:
                    with db.transaction() as txn:
                        CourseWord.create(
                            course=course,
                            word=token[0],
                            count=token[1],
                            active=True,
                            lectures=lecture_token[token[0]]
                        )
                        txn.commit()
                except peewee.OperationalError as e:
                    print "Could not create a record for course {0}, word {1}, {2}".format(course.name.encode('utf8'),
                                                                                           token[0].encode('utf8'), e)

    def getCourseWords(self, courseId=0):
        if courseId == 0:
            courseWords = CourseWord.select()
        else:
            courseWords = CourseWord.select().where(CourseWord.course == courseId)
        return list(courseWords)

    def createCorpusTokens(self):
        token_dict = {}
        for courseWord in self.getCourseWords():
            if token_dict.has_key(courseWord.word):
                token_dict[courseWord.word] += courseWord.count
            else:
                token_dict[courseWord.word] = courseWord.count

        sorted_tokens = sorted(token_dict.items(), key=operator.itemgetter(1))
        for token in sorted_tokens:
            print token
            try:
                with db.transaction() as txn:
                    CorpusWord.create(
                        word=token[0],
                        count=token[1],
                        active=True
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for word {}, {}".format(token[0], e)

    def calc_tf(self):
        # Augmented term frequency: weight = 0.5 + 0.5 * count / max count within the lecture.
        for course in self.getCourses(55):  # hard-coded course id
            print course.name
            for lecture in self.getLectures(course):
                maxCount = 0
                for lectureWord in self.getLectureWords(lecture):
                    maxCount = max(maxCount, lectureWord.count)

                for lectureWord in self.getLectureWords(lecture):
                    try:
                        with db.transaction():
                            lectureWord.weight = 0.5 + (0.5 * lectureWord.count) / maxCount
                            lectureWord.save()
                    except peewee.OperationalError as e:
                        print e
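A hedged usage sketch for the Tokenizer class above, limited to extractTokens; it assumes the external helpers the constructor needs (StopWord, NLTK, lemmagen, langdetect) are importable, while the database-facing methods additionally require the peewee models the snippet references. The sample sentence is arbitrary Estonian text.

tokenizer = Tokenizer(lemmatize=True)
counts = tokenizer.extractTokens(u'Tudengid õpivad ülikoolis.')
# counts maps each lemmatized, non-stopword token to its frequency in the text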