Example #1
def index(request, phrase_list):
    print("INDEX with phrase list:", phrase_list)
    lemmatizer = Lemmatizer()
    lemmatised_list = []

    # Skip the element at index 0; lemmatize the remaining words.
    for i, word in enumerate(phrase_list):
        if i:
            lemmatised_list.append(lemmatizer.lemmatize(word) + " ")

    return Response(json.dumps(lemmatised_list))
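The view above depends on a web framework's Response object and a module-level json import. A minimal standalone sketch of the same lemmatization step, keeping the original behaviour of skipping the element at index 0 and relying on the default Lemmatizer() dictionary (which the tests later in this list exercise with Slovene words):

import json
from lemmagen.lemmatizer import Lemmatizer

def lemmatize_phrase_list(phrase_list):
    # Mirrors the view above without the request/Response plumbing.
    # Assumes the default Lemmatizer() dictionary is appropriate here.
    lemmatizer = Lemmatizer()
    return json.dumps([lemmatizer.lemmatize(word) + " "
                       for i, word in enumerate(phrase_list) if i])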
Example #2
    def __init__(self):
        this_dir = os.path.dirname(os.path.abspath(__file__))
        self.lemmatizer = Lemmatizer()
        dir = os.path.join(this_dir, "tokenizers/slovene.pickle")
        self.sent_detector = nltk.data.load("file://" + dir)

        self.stopwords = open(
            os.path.join(this_dir, "tokenizers/stopwords.txt"),
            "rb").read().splitlines()
        self.stopwords = filter(lambda w: not w.startswith("#"),
                                self.stopwords)
        # Convert to unicode
        self.stopwords = [word.decode("utf-8") for word in self.stopwords]
Example #3
class Summarizer():
    def __init__(self):
        this_dir = os.path.dirname(os.path.abspath(__file__))
        self.lemmatizer = Lemmatizer()
        dir = os.path.join(this_dir, "tokenizers/slovene.pickle")
        self.sent_detector = nltk.data.load("file://" + dir)

        self.stopwords = open(
            os.path.join(this_dir, "tokenizers/stopwords.txt"),
            "rb").read().splitlines()
        self.stopwords = filter(lambda w: not w.startswith("#"),
                                self.stopwords)
        # Convert to unicode
        self.stopwords = [word.decode("utf-8") for word in self.stopwords]

    def summarize(self,
                  article_text,
                  num_sentences=DEFAULT_SUMMARIZATION_NUMBER):

        # Get words from article
        words = word_tokenize(article_text)

        # Filter non-alphanumeric chars from words
        words = [filter(unicode.isalnum, word) for word in words]
        words = filter(lambda w: len(w) > 0, words)  # Remove empty words

        # Now lemmatize all words
        words = [
            self.lemmatizer.lemmatize(word).lower() for word in words
            if word.lower() not in self.stopwords
        ]
        word_frequencies = FreqDist(words)
        most_frequent = [word[0] for word in word_frequencies.items()[:100]]

        # Now get sentences
        sentences = self.sent_detector.tokenize(article_text)

        wordcountdict = defaultdict(int)

        for word in most_frequent:
            lem_word = self.lemmatizer.lemmatize(word).lower()
            for i in range(0, len(sentences)):
                if lem_word in sentences[i]:
                    wordcountdict[i] += 1

        sorted_wordcounts = sorted(wordcountdict.iteritems(),
                                   key=operator.itemgetter(1),
                                   reverse=True)[:num_sentences]
        return [sentences[num] for num, count in sorted_wordcounts]
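Example #3 is Python 2 code (unicode.isalnum, dict.iteritems, slicing FreqDist.items()). A rough Python 3 sketch of the same frequency-based sentence scoring, assuming NLTK 3's FreqDist.most_common and the same lemmatizer and stopword list:

from collections import defaultdict
from nltk import FreqDist

def score_sentences(words, sentences, lemmatizer, stopwords, top_n=100):
    # Keep only the alphanumeric characters of each token, drop empty tokens.
    words = ["".join(ch for ch in w if ch.isalnum()) for w in words]
    # Lemmatize, lowercase and remove stopwords, as in summarize() above.
    words = [lemmatizer.lemmatize(w).lower() for w in words
             if w and w.lower() not in stopwords]
    most_frequent = [w for w, _ in FreqDist(words).most_common(top_n)]

    # Score each sentence by how many of the frequent lemmas it contains.
    scores = defaultdict(int)
    for lemma in most_frequent:
        for i, sentence in enumerate(sentences):
            if lemma in sentence:
                scores[i] += 1
    return scores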
Example #4
 def __init__(self, lemmatize=True):
     self.debug = False
     self.stemmer = PorterStemmer()
     self.lemmatizer = WordNetLemmatizer()
     self.estLemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ESTONIAN)
     self.lemmatize = lemmatize
     self.stopwords = self.get_stopwords()
Example #5
def _get_lemmatizer(language: str) -> Callable:
    if language in lemmagen_languages:
        return Lemmatizer(
            dictionary=lemmagen_languages[language.lower()]
        ).lemmatize
    else:
        return get_udipipe_lematizer(language)
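lemmagen_languages is not shown in Example #5; presumably it maps language names to the lemmagen dictionary constants used throughout these examples. A hypothetical sketch (the key names and casing are assumptions; note that the function above tests membership with language but indexes with language.lower()):

import lemmagen

# Hypothetical mapping; only the dictionaries that appear in these examples are listed.
lemmagen_languages = {
    "english": lemmagen.DICTIONARY_ENGLISH,
    "slovenian": lemmagen.DICTIONARY_SLOVENE,
    "estonian": lemmagen.DICTIONARY_ESTONIAN,
}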
Example #6
def lemmatizeTokens(tokens):

    lemmatized_tokens = []
    lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

    for token in tokens:
        lemmatized_tokens.append(lemmatizer.lemmatize(token))

    return lemmatized_tokens
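For comparison, the same loop can be written as a list comprehension with a single Lemmatizer instance:

import lemmagen
from lemmagen.lemmatizer import Lemmatizer

def lemmatize_tokens(tokens):
    # Equivalent to lemmatizeTokens above.
    lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    return [lemmatizer.lemmatize(token) for token in tokens]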
Example #7
def removeStopWordsAndLemmatisation(tokens):

    new_content = ""

    stop_words = set(stopwords.words('slovene'))
    # Build the lemmatizer once instead of once per token.
    lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

    for token in tokens:
        if type(token) == tuple:
            x = token[0]
        else:
            x = token
        # Library quirk: the Slovene stopwords carry trailing whitespace,
        # so pad the token with a space before the membership check.
        x = x + ' '

        if x.lower() not in stop_words:
            x = x.strip()
            lemmatized_word = lemmatizer.lemmatize(x)
            new_content += lemmatized_word + " "

    return new_content
Example #8
class Preprocessing:
    def __init__(self):
        self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
        self.punc_regex = re.compile(r'^[^0-9a-zA-Z]+$')

    def preprocess(self, text, raw=False, keep_stop_words=False):
        # Tokenize
        tokens = word_tokenize(text)

        if not raw:
            # Lemmatize
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

            # Convert to lowercase
            tokens = [t.lower() for t in tokens]

        if not keep_stop_words:
            # Remove stopwords and punctuations
            tokens = [
                t for t in tokens
                if t not in stop_words_slovene and not self.punc_regex.match(t)
            ]

        return tokens
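A brief usage sketch for the Preprocessing class above; the imports and the stop_words_slovene set it references are assumed to be defined in the surrounding module:

preprocessor = Preprocessing()
# Lemmatized, lowercased tokens with stopwords and punctuation removed.
print(preprocessor.preprocess("Čistilci čistijo pisarne."))
# Raw tokens, stopwords kept.
print(preprocessor.preprocess("Čistilci čistijo pisarne.", raw=True, keep_stop_words=True))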
Example #9
 def test_emptystring(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize("")
     self.assertEqual("", lemmatized)
Example #10
 def test_utf8lemmatize(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize("čistijo")
     self.assertEqual("čistiti", lemmatized)
Example #11
import argparse
import re
import time
import nltk
import lemmagen
from itertools import groupby
from lemmagen.lemmatizer import Lemmatizer
#import gensim
#from gensim.models.doc2vec import TaggedDocument
#from experimentation import compress

import resource

rsrc = resource.RLIMIT_AS
soft, hard = resource.getrlimit(rsrc)
resource.setrlimit(
    rsrc, (13500000000, hard))  # limit allowed Python memory usage to ~13.5 GB

start_time = time.time()
lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
chkr = SpellChecker("en_US")


def variety_words():
    l_us = read_wordList('word_lists/en_US.dic')
    l_ca = read_wordList('word_lists/en_CA.dic')
    l_au = read_wordList('word_lists/en_AU.dic')
    l_all = l_us & l_ca & l_au
    l_just_us = l_us - l_all
    l_just_ca = l_ca - l_all
    l_just_au = l_au - l_all
    return (l_just_us, l_just_ca, l_just_au)


def generate_output(path, author_id, lang, variety, gender):
        result.append(string)
    return result


Example #12
sl = []
en = []

with open('AGIF_small.tmx') as fp:
    xml = bs(fp, 'lxml-xml')
    for cnt, tuv in enumerate(xml.body.find_all('tuv')):
        if tuv.get('xml:lang') == 'en-GB':
            text = tuv.seg.getText().replace('\\n', ' ').replace(
                '\n', ' ').replace('\u2028', ' ').replace('\t', ' ').strip()
            text = re.sub('\\.+', '.', text)
            text = ' '.join(text.split()).lower()
            en.append(text)
        elif tuv.get('xml:lang') == 'sl-SI':
            text = tuv.seg.getText().replace('\\n', ' ').replace(
                '\n', ' ').replace('\u2028', ' ').replace('\t', ' ').strip()
            text = re.sub('\\.+', '.', text)
            text = ' '.join(text.split()).lower()
            sl.append(text)

lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

sl_lemmas = get_lemmas(sl, lemmatizer_sl)

for el in sl_lemmas:
    print(el)
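get_lemmas is called above but not defined in the excerpt; a hypothetical sketch, assuming it lemmatizes each whitespace-separated token of every segment:

def get_lemmas(segments, lemmatizer):
    # Hypothetical helper: lemmatize every token of every segment.
    return [[lemmatizer.lemmatize(token) for token in segment.split()]
            for segment in segments]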
Example #13
 def test_lemmatize(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize(str("hodimo"))
     self.assertEqual(str("hoditi"), lemmatized)
Example #14
 def test_null(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize(None)
     self.assertEqual(None, lemmatized)
Example #15
 def test_punctuation(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize("!\"=`.,/:")
     self.assertEqual("!\"=`.,/:", lemmatized)
Example #16
class Lemmatization():
    def __init__(self):
        self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

    def lemmatize(self, token):
        return self.lemmatizer.lemmatize(token)
Example #17
def createLemmatizedFeatures(data, giza_dict, giza_dict_reversed, cognates=False):
    lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
    data['src_term_lemma'] = data['src_term'].map(lambda x: lemmatize(x, lemmatizer_en))
    lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    data['tar_term_lemma'] = data['tar_term'].map(lambda x: lemmatize(x, lemmatizer_sl))
    data['term_pair_lemma'] = data['src_term_lemma'] + '\t' + data['tar_term_lemma']

    data['isFirstWordTranslated'] = data['term_pair_lemma'].map(lambda x: isFirstWordTranslated(x, giza_dict))
    data['isLastWordTranslated'] = data['term_pair_lemma'].map(lambda x: isLastWordTranslated(x, giza_dict))
    data['percentageOfTranslatedWords'] = data['term_pair_lemma'].map(lambda x: percentageOfTranslatedWords(x, giza_dict))
    data['percentageOfNotTranslatedWords'] = data['term_pair_lemma'].map(lambda x: percentageOfNotTranslatedWords(x, giza_dict))
    data['longestTranslatedUnitInPercentage'] = data['term_pair_lemma'].map(lambda x: longestTranslatedUnitInPercentage(x, giza_dict))
    data['longestNotTranslatedUnitInPercentage'] = data['term_pair_lemma'].map(lambda x: longestNotTranslatedUnitInPercentage(x, giza_dict))

    data['term_pair_lemma'] = data['tar_term_lemma'] + '\t' + data['src_term_lemma']

    data['isFirstWordTranslated_reversed'] = data['term_pair_lemma'].map(lambda x: isFirstWordTranslated(x, giza_dict_reversed))
    data['isLastWordTranslated_reversed'] = data['term_pair_lemma'].map(lambda x: isLastWordTranslated(x, giza_dict_reversed))
    data['percentageOfTranslatedWords_reversed'] = data['term_pair_lemma'].map(lambda x: percentageOfTranslatedWords(x, giza_dict_reversed))
    data['percentageOfNotTranslatedWords_reversed'] = data['term_pair_lemma'].map(lambda x: percentageOfNotTranslatedWords(x, giza_dict_reversed))
    data['longestTranslatedUnitInPercentage_reversed'] = data['term_pair_lemma'].map(lambda x: longestTranslatedUnitInPercentage(x, giza_dict_reversed))
    data['longestNotTranslatedUnitInPercentage_reversed'] = data['term_pair_lemma'].map(lambda x: longestNotTranslatedUnitInPercentage(x, giza_dict_reversed))
    
    data['src_term_tr'] = data['src_term'].map(lambda x: transcribe(x, 'en'))
    data['tar_term_tr'] = data['tar_term'].map(lambda x: transcribe(x, 'sl'))
    data['term_pair_tr'] = data['src_term_tr'] + '\t' + data['tar_term_tr']
    data['term_pair'] = data['src_term'] + '\t' + data['tar_term']

    if cognates:
        data['isFirstWordCognate'] = data['term_pair_tr'].map(lambda x: isWordCognate(x, 0))
        data['isLastWordCognate'] = data['term_pair_tr'].map(lambda x: isWordCognate(x, -1))


    data['longestCommonSubstringRatio'] = data['term_pair_tr'].map(lambda x: float(len(longest_common_substring(x))) / max(len(x.split('\t')[0]), len(x.split('\t')[1])))
    data['longestCommonSubsequenceRatio'] = data['term_pair_tr'].map(lambda x: float(len(longest_common_subsequence(x))) / max(len(x.split('\t')[0]), len(x.split('\t')[1])))
    data['dice'] = data['term_pair_tr'].map(lambda x: (2 * float(len(longest_common_substring(x)))) / (len(x.split('\t')[0]) + len(x.split('\t')[1])))
    data['NWD'] = data['term_pair_tr'].map(lambda x: float(len(longest_common_substring(x))) / min(len(x.split('\t')[0]), len(x.split('\t')[1])))
    data['editDistanceNormalized'] = data['term_pair_tr'].map(lambda x: 1 - (float(editdistance.eval(x.split('\t')[0], x.split('\t')[1])) / max(len(x.split('\t')[0]), len(x.split('\t')[1]))))

    data['term_pair_lemma'] = data['src_term_lemma'] + '\t' + data['tar_term_lemma']

    data['isFirstWordCovered'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict, 0))
    data['isLastWordCovered'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict, -1))
    data['percentageOfCoverage'] = data['term_pair_lemma'].map(lambda x: percentageOfCoverage(x, giza_dict))
    data['percentageOfNonCoverage'] = data['term_pair_lemma'].map(lambda x: 1 - percentageOfCoverage(x, giza_dict))
    data['diffBetweenCoverageAndNonCoverage'] = data['percentageOfCoverage'] - data['percentageOfNonCoverage']

    if cognates:
        data['wordLengthMatch'] = data['term_pair'].map(lambda x: wordLengthMatch(x))
        data['sourceTermLength'] = data['term_pair'].map(lambda x: sourceTermLength(x))
        data['targetTermLength'] = data['term_pair'].map(lambda x: targetTermLength(x))

    data['term_pair_lemma'] = data['tar_term_lemma'] + '\t' + data['src_term_lemma']

    data['isFirstWordCovered_reversed'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict_reversed, 0))
    data['isLastWordCovered_reversed'] = data['term_pair_lemma'].map(lambda x: isLemmatizedWordCovered(x, giza_dict_reversed, -1))
    data['percentageOfCoverage_reversed'] = data['term_pair_lemma'].map(lambda x: percentageOfCoverage(x, giza_dict_reversed))
    data['percentageOfNonCoverage_reversed'] = data['term_pair_lemma'].map(lambda x: 1 - percentageOfCoverage(x, giza_dict_reversed))
    data['diffBetweenCoverageAndNonCoverage_reversed'] = data['percentageOfCoverage_reversed'] - data['percentageOfNonCoverage_reversed']

    data['averagePercentageOfTranslatedWords'] = (data['percentageOfTranslatedWords'] + data['percentageOfTranslatedWords_reversed']) / 2


    data = data.drop(['term_pair', 'term_pair_lemma', 'src_term_lemma', 'tar_term_lemma', 'term_pair_tr', 'src_term_tr', 'tar_term_tr'], axis = 1)

    #print('feature construction done')
    return data
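The module-level lemmatize(x, lemmatizer) helper used to build src_term_lemma and tar_term_lemma is not shown; a hypothetical sketch, assuming a term is a whitespace-separated string of words:

def lemmatize(term, lemmatizer):
    # Hypothetical helper: lemmatize each word of a (possibly multi-word) term.
    return ' '.join(lemmatizer.lemmatize(word) for word in term.split())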
Example #18
 def test_lemmatize(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize(str("hodimo"))
     self.assertEqual(str("hoditi"), lemmatized)
Example #19
 def test_punctuation(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize("!\"=`.,/:")
     self.assertEqual("!\"=`.,/:", lemmatized)
Example #20
 def __init__(self):
     self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
Example #21
 def __init__(self):
     self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
     self.punc_regex = re.compile(r'^[^0-9a-zA-Z]+$')
Example #22
 def test_null(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize(None)
     self.assertEqual(None, lemmatized)
Example #23
 def test_utf8lemmatize(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize("čistijo")
     self.assertEqual("čistiti", lemmatized)
Example #24
import sys

sys.path.append("C:/Users/dis/Documents/JanJezersek/EkoSmart/pylemmagen")

from lemmagen.lemmatizer import Lemmatizer

a = Lemmatizer()

# Skip sys.argv[0] (the script name) and lemmatize the remaining arguments.
for i, word in enumerate(sys.argv):
    if i:
        sys.stdout.write(a.lemmatize(word) + " ")
Example #25
 def test_emptystring(self):
     lemmatizer = Lemmatizer()
     lemmatized = lemmatizer.lemmatize("")
     self.assertEqual("", lemmatized)
Example #26
def arrangeLemmatizedData(input, lemmatization=False, reverse=False):
    dd = defaultdict(list)

    # Build the lemmatizers once, outside the per-line loop.
    if lemmatization:
        lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
        lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

    with openio(input, encoding='utf8') as f:
        for line in f:
            line = line.split()
            source, target, score = line[0], line[1], line[2]
            source = source.strip('`’“„,‘')
            target = target.strip('`’“„,‘')
            if lemmatization and not reverse:
                source = lemmatizer_en.lemmatize(source)
                target = lemmatizer_sl.lemmatize(target)
            elif lemmatization and reverse:
                source = lemmatizer_sl.lemmatize(source)
                target = lemmatizer_en.lemmatize(target)

            dd[source].append((target, score))

    for k, v in dd.items():
        v = sorted(v, key=lambda tup: float(tup[1]), reverse = True)
        new_v = []
        for word, p in v:
            if (len(k) < 4 and len(word) > 5) or (len(word) < 4 and len(k) > 5):
                continue
            if float(p) < 0.05:
                continue
            new_v.append((word, p))
        dd[k] = new_v
    return dd
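openio is not imported in the excerpt above; presumably it is just an alias for a text-mode opener, e.g. (assumption):

from io import open as openio  # assumption: alias for io.open with encoding support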
Example #27
class Tokenizer(object):
    def __init__(self, lemmatize=True):
        self.debug = False
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.estLemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ESTONIAN)
        self.lemmatize = lemmatize
        self.stopwords = self.get_stopwords()

    def get_stopwords(self):
        sw = StopWord()
        return set(sw.words)

    def lemstem(self, token):
        if self.lemmatize:
            return self.lemmatizer.lemmatize(token)
        else:
            return self.stemmer.stem(token)

    def extractTokens(self, text):
        try:
            tokens = word_tokenize(text)
        except UnicodeEncodeError:
            tokens = []

        if not tokens:
            return {}

        est_text = self.is_estonian(text)

        token_dict = {}
        for token in tokens:
            token = token.lower()

            # keep only alphabetic tokens longer than two characters
            if not (token.isalpha() and len(token) > 2):
                continue

            try:
                if est_text:
                    lemstem_word = self.estLemmatizer.lemmatize(token)
                else:
                    lemstem_word = self.lemstem(token)
            except Exception:
                lemstem_word = token

            if lemstem_word not in self.stopwords:
                if self.debug:
                    print "{0}: {1}".format(token.encode('utf-8'), lemstem_word.encode('utf-8'))
                if token_dict.has_key(lemstem_word):
                    token_dict[lemstem_word] += 1
                else:
                    token_dict[lemstem_word] = 1

        return token_dict

    def is_estonian(self, text):
        est = False
        try:
            est = detect(text) == 'et'
        except Exception:
            pass
        return est

    def getLectureRecord(self, lectureId):
        try:
            data = Lecture.select().where(Lecture.id == lectureId).get()
            return data
        except Exception:
            return None

    def extractLectureTokens(self, lecture):
        if lecture is None:
            return False

        text = lecture.content
        tokens = self.extractTokens(text)
        sorted_tokens = sorted(tokens.items(), key=operator.itemgetter(1))

        for token in sorted_tokens:
            try:
                with db.transaction() as txn:
                    LectureWord.create(
                        lecture=lecture,
                        word=token[0],
                        count=token[1],
                        active=True,
                        weight=0
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for lecture {0}, word {1}, {2}".format(lecture.id, token[0], e)

            if self.debug:
                print token

        return True

    def getCourseRecord(self, courseId):
        try:
            data = Course.select().where(Course.id == courseId).get()
            return data
        except Exception:
            return None

    def getLectures(self, course):
        lectures = Lecture.select().where(Lecture.course == course)
        return list(lectures)

    def extractCourseTokens(self, lectures):
        print "Lecture count: {0}".format(len(lectures))
        for lecture in lectures:
            print "Lecture: {0}".format(lecture.id)
            self.extractLectureTokens(lecture)

    def getCourses(self, courseId=0):
        if courseId:
            courses = Course.select().where(Course.id == courseId)
        else:
            courses = Course.select()
        return list(courses)

    def extractAllCourseTokens(self):
        for course in self.getCourses():
            print course.id, course.name
            lectures = self.getLectures(course)
            self.extractCourseTokens(lectures)

    def getLectureWords(self, lecture):
        lectureWords = list(LectureWord.select().where(LectureWord.lecture == lecture))
        return lectureWords

    def createCourseTokens(self):
        for course in self.getCourses():
            print "{}: {}".format(course.id, course.name.encode('utf8'))
            token_dict = {}
            lecture_token = {}

            for lecture in self.getLectures(course):
                lectureWords = self.getLectureWords(lecture)
                for lectureWord in lectureWords:
                    if not token_dict.has_key(lectureWord.word):
                        token_dict[lectureWord.word] = 0
                        lecture_token[lectureWord.word] = 0

                    token_dict[lectureWord.word] += lectureWord.count
                    lecture_token[lectureWord.word] += 1
            sorted_tokens = sorted(token_dict.items(), key=operator.itemgetter(1))
            for token in sorted_tokens:
                try:
                    with db.transaction() as txn:
                        CourseWord.create(
                            course=course,
                            word=token[0],
                            count=token[1],
                            active=True,
                            lectures=lecture_token[token[0]]
                        )
                        txn.commit()
                except peewee.OperationalError as e:
                    print "Could not create a record for course {0}, word {1}, {2}".format(course.name.encode('utf8'),
                                                                                           token[0].encode('utf8'), e)

    def getCourseWords(self, courseId=0):
        if courseId == 0:
            courseWords = CourseWord.select()
        else:
            courseWords = CourseWord.select().where(CourseWord.course == courseId)
        return list(courseWords)

    def createCorpusTokens(self):
        token_dict = {}
        for courseWord in self.getCourseWords():
            if token_dict.has_key(courseWord.word):
                token_dict[courseWord.word] += courseWord.count
            else:
                token_dict[courseWord.word] = courseWord.count

        sorted_tokens = sorted(token_dict.items(), key=operator.itemgetter(1))
        for token in sorted_tokens:
            print token
            try:
                with db.transaction() as txn:
                    CorpusWord.create(
                        word=token[0],
                        count=token[1],
                        active=True
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for word {}, {}".format(token[0], e)

    def calc_tf(self):
        for course in self.getCourses(55):
            print course.name
            for lecture in self.getLectures(course):
                maxCount = 0
                for lectureWord in self.getLectureWords(lecture):
                    maxCount = max(maxCount, lectureWord.count)

                for lectureWord in self.getLectureWords(lecture):
                    try:
                        with db.transaction():
                            lectureWord.weight = 0.5 + (0.5 * lectureWord.count) / maxCount
                            lectureWord.save()
                    except peewee.OperationalError as e:
                        print e
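calc_tf above applies augmented term frequency, weight = 0.5 + 0.5 * count / max_count, where max_count is the count of the most frequent word in the lecture. A standalone sketch of just that weighting:

def augmented_tf(count, max_count):
    # Augmented term frequency as used in calc_tf: a 0.5 base plus 0.5 scaled
    # by the word's count relative to the lecture's most frequent word.
    return 0.5 + (0.5 * count) / max_count

# e.g. the most frequent word gets weight 1.0; a word seen once gets 0.5 + 0.5 / max_count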