Example #1
from collections import defaultdict

import lemmagen
from lemmagen.lemmatizer import Lemmatizer


def arrangeLemmatizedData(input, lemmatization=False, reverse=False):
    dd = defaultdict(list)
    if lemmatization:
        # Build the lemmatizers once instead of once per input line.
        lemmatizer_en = Lemmatizer(dictionary=lemmagen.DICTIONARY_ENGLISH)
        lemmatizer_sl = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
    # openio is expected to behave like io.open, i.e. accept an encoding argument.
    with openio(input, encoding='utf8') as f:
        for line in f:
            line = line.split()
            source, target, score = line[0], line[1], line[2]
            source = source.strip('`’“„,‘')
            target = target.strip('`’“„,‘')
            if lemmatization and not reverse:
                # English source, Slovene target
                source = lemmatizer_en.lemmatize(source)
                target = lemmatizer_sl.lemmatize(target)
            elif lemmatization and reverse:
                # Slovene source, English target
                source = lemmatizer_sl.lemmatize(source)
                target = lemmatizer_en.lemmatize(target)

            dd[source].append((target, score))

    for k, v in dd.items():
        # Sort translation candidates by descending score.
        v = sorted(v, key=lambda tup: float(tup[1]), reverse=True)
        new_v = []
        for word, p in v:
            # Drop pairs whose word lengths differ too much.
            if (len(k) < 4 and len(word) > 5) or (len(word) < 4 and len(k) > 5):
                continue
            # Drop low-scoring candidates.
            if float(p) < 0.05:
                continue
            new_v.append((word, p))
        dd[k] = new_v
    return dd
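A brief, hedged usage sketch for the function above. The file name lexicon.txt is hypothetical; its lines are assumed to hold whitespace-separated source, target and score fields, which is what the parsing loop expects.

translations = arrangeLemmatizedData('lexicon.txt', lemmatization=True)
for source, candidates in translations.items():
    # candidates are (target, score) pairs sorted by descending score
    print(source + ': ' + ', '.join(t for t, s in candidates[:3]))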
Example #2
# Note: this snippet targets Python 2 and an older NLTK (unicode, list-returning
# filter(), sliceable FreqDist.items(), dict.iteritems()).
class Summarizer():
    def __init__(self):
        this_dir = os.path.dirname(os.path.abspath(__file__))
        self.lemmatizer = Lemmatizer()
        dir = os.path.join(this_dir, "tokenizers/slovene.pickle")
        self.sent_detector = nltk.data.load("file://" + dir)

        self.stopwords = open(
            os.path.join(this_dir, "tokenizers/stopwords.txt"),
            "rb").read().splitlines()
        self.stopwords = filter(lambda w: not w.startswith("#"),
                                self.stopwords)
        # Convert to unicode
        self.stopwords = [word.decode("utf-8") for word in self.stopwords]

    def summarize(self,
                  article_text,
                  num_sentences=DEFAULT_SUMMARIZATION_NUMBER):

        # Get words from article
        words = word_tokenize(article_text)

        # Filter non-alphanumeric chars from words
        words = [filter(unicode.isalnum, word) for word in words]
        words = filter(lambda w: len(w) > 0, words)  # Remove empty words

        # Now lemmatize all words
        words = [
            self.lemmatizer.lemmatize(word).lower() for word in words
            if word.lower() not in self.stopwords
        ]
        word_frequencies = FreqDist(words)
        most_frequent = [word[0] for word in word_frequencies.items()[:100]]

        # Now get sentences
        sentences = self.sent_detector.tokenize(article_text)

        wordcountdict = defaultdict(int)

        for word in most_frequent:
            lem_word = self.lemmatizer.lemmatize(word).lower()
            for i in range(0, len(sentences)):
                if lem_word in sentences[i]:
                    wordcountdict[i] += 1

        sorted_wordcounts = sorted(wordcountdict.iteritems(),
                                   key=operator.itemgetter(1),
                                   reverse=True)[:num_sentences]
        return [sentences[num] for num, count in sorted_wordcounts]
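A hedged usage sketch for the Summarizer class above (Python 2, matching the snippet); the article text is an arbitrary placeholder, not part of the original example.

summarizer = Summarizer()
for sentence in summarizer.summarize(u"... article text in Slovene ...", num_sentences=3):
    print(sentence)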
Example #3
def index(request, phrase_list):
    print("INDEX with phrase list:", phrase_list)
    a = Lemmatizer()
    lemmatised_list = []

    for i, word in enumerate(phrase_list):
        # Skip the first element of phrase_list; lemmatize the rest.
        if i:
            lemmatised_list.append(a.lemmatize(word) + " ")

    return Response(json.dumps(lemmatised_list))
Example #4
class Preprocessing:
    def __init__(self):
        self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)
        self.punc_regex = re.compile(r'^[^0-9a-zA-Z]+$')

    def preprocess(self, text, raw=False, keep_stop_words=False):
        # Tokenize
        tokens = word_tokenize(text)

        if not raw:
            # Lemmatize
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

            # Convert to lowercase
            tokens = [t.lower() for t in tokens]

        if not keep_stop_words:
            # Remove stopwords and punctuations
            tokens = [
                t for t in tokens
                if t not in stop_words_slovene and not self.punc_regex.match(t)
            ]

        return tokens
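A short, hedged usage sketch for the Preprocessing class above; the sample sentence is arbitrary Slovene text, and stop_words_slovene is assumed to be defined at module level, as the snippet implies.

pre = Preprocessing()
tokens = pre.preprocess(u'Psi tečejo po zelenem travniku.')
# tokens: lemmatized, lowercased words with Slovene stopwords and punctuation removed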
Example #5
    def test_lemmatize(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize(str("hodimo"))
        self.assertEqual(str("hoditi"), lemmatized)
Example #6
    def test_null(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize(None)
        self.assertEqual(None, lemmatized)
Example #7
    def test_punctuation(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize("!\"=`.,/:")
        self.assertEqual("!\"=`.,/:", lemmatized)
Example #8
    def test_emptystring(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize("")
        self.assertEqual("", lemmatized)
Example #9
    def test_utf8lemmatize(self):
        lemmatizer = Lemmatizer()
        lemmatized = lemmatizer.lemmatize("čistijo")
        self.assertEqual("čistiti", lemmatized)
Example #10
class Lemmatization():
    def __init__(self):
        self.lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

    def lemmatize(self, token):
        return self.lemmatizer.lemmatize(token)
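A one-line usage sketch for the wrapper above; the expected lemma matches the test examples earlier on this page.

print(Lemmatization().lemmatize(u'hodimo'))  # expected lemma: 'hoditi'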
Example #11
import sys

sys.path.append("C:/Users/dis/Documents/JanJezersek/EkoSmart/pylemmagen")

from lemmagen.lemmatizer import Lemmatizer

a = Lemmatizer()

for i, word in enumerate(sys.argv):
    # Skip argv[0] (the script path) and lemmatize every word passed on the command line.
    if i:
        sys.stdout.write(a.lemmatize(word) + " ")
Example #12
# Note: this snippet targets Python 2 (print statements, dict.has_key) and relies on
# peewee models (Lecture, Course, LectureWord, CourseWord, CorpusWord, db) defined elsewhere.
class Tokenizer(object):
    def __init__(self, lemmatize=True):
        self.debug = False
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.estLemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_ESTONIAN)
        self.lemmatize = lemmatize
        self.stopwords = self.get_stopwords()

    def get_stopwords(self):
        sw = StopWord()
        return set(sw.words)

    def lemstem(self, token):
        if self.lemmatize:
            return self.lemmatizer.lemmatize(token)
        else:
            return self.stemmer.stem(token)

    def extractTokens(self, text):
        try:
            tokens = word_tokenize(text)
        except UnicodeEncodeError:
            tokens = []

        if not tokens:
            return {}

        est_text = self.is_estonian(text)

        token_dict = {}
        for token in tokens:
            token = token.lower()

            # keep only purely alphabetic tokens longer than two characters
            if not (token.isalpha() and len(token) > 2):
                continue

            try:
                if est_text:
                    lemstem_word = self.estLemmatizer.lemmatize(token)
                else:
                    lemstem_word = self.lemstem(token)
            except Exception:
                lemstem_word = token

            if lemstem_word not in self.stopwords:
                if self.debug:
                    print "{0}: {1}".format(token.encode('utf-8'), lemstem_word.encode('utf-8'))
                if token_dict.has_key(lemstem_word):
                    token_dict[lemstem_word] += 1
                else:
                    token_dict[lemstem_word] = 1

        return token_dict

    def is_estonian(self, text):
        est = False
        try:
            est = detect(text) == 'et'
        except Exception:
            pass
        return est

    def getLectureRecord(self, lectureId):
        try:
            data = Lecture.select().where(Lecture.id == lectureId).get()
            return data
        except Exception:
            return None

    def extractLectureTokens(self, lecture):
        if lecture is None:
            return False

        text = lecture.content
        tokens = self.extractTokens(text)
        sorted_tokens = sorted(tokens.items(), key=operator.itemgetter(1))

        for token in sorted_tokens:
            try:
                with db.transaction() as txn:
                    LectureWord.create(
                        lecture=lecture,
                        word=token[0],
                        count=token[1],
                        active=True,
                        weight=0
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for lecture {0}, word {1}, {2}".format(lecture.id, token[0], e)

            if self.debug:
                print token

        return True

    def getCourseRecord(self, courseId):
        try:
            data = Course.select().where(Course.id == courseId).get()
            return data
        except Exception:
            return None

    def getLectures(self, course):
        lectures = Lecture.select().where(Lecture.course == course)
        return list(lectures)

    def extractCourseTokens(self, lectures):
        print "Lecture count: {0}".format(len(lectures))
        for lecture in lectures:
            print "Lecture: {0}".format(lecture.id)
            self.extractLectureTokens(lecture)

    def getCourses(self, courseId=0):
        if courseId:
            courses = Course.select().where(Course.id == courseId)
        else:
            courses = Course.select()
        return list(courses)

    def extractAllCourseTokens(self):
        for course in self.getCourses():
            print course.id, course.name
            lectures = self.getLectures(course)
            self.extractCourseTokens(lectures)

    def getLectureWords(self, lecture):
        lectureWords = list(LectureWord.select().where(LectureWord.lecture == lecture))
        return lectureWords

    def createCourseTokens(self):
        for course in self.getCourses():
            print "{}: {}".format(course.id, course.name.encode('utf8'))
            token_dict = {}
            lecture_token = {}

            for lecture in self.getLectures(course):
                lectureWords = self.getLectureWords(lecture)
                for lectureWord in lectureWords:
                    if not token_dict.has_key(lectureWord.word):
                        token_dict[lectureWord.word] = 0
                        lecture_token[lectureWord.word] = 0

                    token_dict[lectureWord.word] += lectureWord.count
                    lecture_token[lectureWord.word] += 1
            sorted_tokens = sorted(token_dict.items(), key=operator.itemgetter(1))
            for token in sorted_tokens:
                try:
                    with db.transaction() as txn:
                        CourseWord.create(
                            course=course,
                            word=token[0],
                            count=token[1],
                            active=True,
                            lectures=lecture_token[token[0]]
                        )
                        txn.commit()
                except peewee.OperationalError as e:
                    print "Could not create a record for course {0}, word {1}, {2}".format(course.name.encode('utf8'),
                                                                                           token[0].encode('utf8'), e)

    def getCourseWords(self, courseId=0):
        if courseId == 0:
            courseWords = CourseWord.select()
        else:
            courseWords = CourseWord.select().where(CourseWord.course == courseId)
        return list(courseWords)

    def createCorpusTokens(self):
        token_dict = {}
        for courseWord in self.getCourseWords():
            if token_dict.has_key(courseWord.word):
                token_dict[courseWord.word] += courseWord.count
            else:
                token_dict[courseWord.word] = courseWord.count

        sorted_tokens = sorted(token_dict.items(), key=operator.itemgetter(1))
        for token in sorted_tokens:
            print token
            try:
                with db.transaction() as txn:
                    CorpusWord.create(
                        word=token[0],
                        count=token[1],
                        active=True
                    )
                    txn.commit()
            except peewee.OperationalError as e:
                print "Could not create a record for word {}, {}".format(token[0], e)

    def calc_tf(self):
        # Augmented term frequency: weight = 0.5 + 0.5 * count / max count within the lecture.
        for course in self.getCourses(55):  # hard-coded course id
            print course.name
            for lecture in self.getLectures(course):
                maxCount = 0
                for lectureWord in self.getLectureWords(lecture):
                    maxCount = max(maxCount, lectureWord.count)

                for lectureWord in self.getLectureWords(lecture):
                    try:
                        with db.transaction():
                            lectureWord.weight = 0.5 + (0.5 * lectureWord.count) / maxCount
                            lectureWord.save()
                    except peewee.OperationalError as e:
                        print e
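A hedged usage sketch for the Tokenizer class above, limited to extractTokens; it assumes the external helpers the constructor needs (StopWord, NLTK, lemmagen, langdetect) are importable, while the database-facing methods additionally require the peewee models the snippet references. The sample sentence is arbitrary Estonian text.

tokenizer = Tokenizer(lemmatize=True)
counts = tokenizer.extractTokens(u'Tudengid õpivad ülikoolis.')
# counts maps each lemmatized, non-stopword token to its frequency in the text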