    def forwards(self, orm):
        "Write your forwards methods here."
        for song in Song.objects.all():
            to_write = list()
            try:
                for i, word in enumerate(extract_words(song.lyrics)):
                    for term in morph.normalize(word.upper()):
                        to_write.append('1 ' + str(i) + " " + term)
            except TypeError:
                pass
            try:
                for i, word in enumerate(extract_words(song.artist)):
                    for term in morph.normalize(word.upper()):
                        to_write.append('2 ' + str(i) + " " + term)
            except TypeError:
                pass
            try:
                for i, word in enumerate(extract_words(song.title)):
                    for term in morph.normalize(word.upper()):
                        to_write.append('3 ' + str(i) + " " + term)
            except TypeError:
                pass
            try:
                for i, word in enumerate(extract_words(song.linked_movie)):
                    for term in morph.normalize(word.upper()):
                        to_write.append('4 ' + str(i) + " " + term)
            except TypeError:
                pass

            song.map_of_normalized_words = "|".join(to_write)
            song.save(update_fields=["map_of_normalized_words"])
            print "Done", song.id
Example #2
    def eval_features(self, number, clazz, text):
        print text

        features = {}
        words_count = 0
        average_length = 0
        latin_words = 0
        for word in tokenizers.extract_words(text):
            words_count += 1
            average_length += len(word)
            if check_latin_word(word):
                latin_words += 1

        sentences = parser.analyze_paragraph(text)

        features['sentences_count'] = len(sentences)
        average_sentence_len = 0
        for sentence in sentences:
            average_sentence_len += len(sentence)

        features['average_sentence_length'] = average_sentence_len / len(sentences)

        excl_marks = "!" in text

        features['words_count'] = words_count
        features['average_length'] = average_length / words_count
        features['class'] = clazz
        features['latin_words'] = latin_words
        features['excl_marks'] = excl_marks
        features['quotes_count'] = len(self.quotes_pattern.findall(text))
        features['digits_count'] = len(self.digit_patter.findall(text))

        self.files_features[number] = features
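Note that under Python 2 both averages above use integer division, so values like average_length / words_count are truncated. If fractional averages were wanted, a hedged one-line adjustment would be:

        features['average_length'] = float(average_length) / words_count if words_count else 0.0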
Example #3
    def learn(self, class_name):
        self.classes.add(class_name)
        print class_name
        self.words_freq[class_name] = {}
        if class_name is "internet":
            dir_name = learn_internet
        else:
            dir_name = learn_nointernet

        for file_name in os.listdir(dir_name):
            print "processing", file_name
            text = open(dir_name + "/" + file_name, "r").read().decode("utf-8")
            words = [word.lower() for word in tokenizers.extract_words(text)]
            self.docs_number += 1
            self.unique_words_set = self.unique_words_set | set(words)
            stemmer = RussianStemmer()
            for word in words:
                stemmed = stemmer.stem(word)
                if stemmed in self.words_freq[class_name]:
                    self.words_freq[class_name][stemmed] += 1
                else:
                    self.words_freq[class_name][stemmed] = 1

            if class_name in self.words_in_class:
                self.words_in_class[class_name] += len(words)
                self.docs_in_class[class_name] += 1
            else:
                self.words_in_class[class_name] = len(words)
                self.docs_in_class[class_name] = 1
Example #4
 def test_exctract_words(self):
     txt = """Это  отразилось: на количественном,и на качествен_ном
             - росте карельско-финляндского сотрудничества - офигеть! кони+лошади=масло.
             -сказал кто-то --нет--"""
     words = list(extract_words(txt))
     self.assertListEqual(
         words,
         [
             "Это",
             "отразилось",
             "на",
             "количественном",
             "и",
             "на",
             "качествен_ном",
             "росте",
             "карельско-финляндского",
             "сотрудничества",
             "офигеть",
             "кони",
             "лошади",
             "масло",
             "сказал",
             "кто-то",
             "нет",
         ],
     )
Example #6
    def forwards(self, orm):
        "Write your forwards methods here."
        all_songs_in_db = Song.objects.all()

        ids_of_songs = set()
        for rec in all_songs_in_db:
            song_id = rec.aid
            ids_of_songs.add(int(song_id))

        csvfile1 = open(FILE_WITH_SONGS_INFO, 'rb')
        reader1 = csv.reader(csvfile1, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        # Add new records to the database
        added_songs = 0
        added_terms = 0
        recognized_tokens = 0
        for row1 in reader1:

            # [self.aid, self.artist, self.title, self.duration, self.lyrics, self.url]
            aid, artist, title, duration, lyrics_id, url = row1[0:6]

            csvfile2 = open(FILE_WITH_TEXTS_OF_SONGS, 'rb')
            reader2 = csv.reader(csvfile2, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            for row2 in reader2:
                # [id_of_song, flag, text]
                lyrics_id2, flag, text = row2[0:3]

                if lyrics_id == lyrics_id2 and flag == 'russian' and int(aid) not in ids_of_songs:

                    # A Russian song not seen before - add it to the database
                    s = Song(aid=int(aid), artist=artist, title=title, duration=duration, url=url, lyrics=text)
                    s.save()

                    # Now pull out all the tokens from the text
                    for word in tokenizers.extract_words(text.decode('utf8')):
                        recognized_tokens += 1
                        for term in morph.normalize(word.upper()):
                            # Take every normalization variant, since there may be homonyms
                            try:
                                w = IndexElement.objects.get(term=term)     # The word was already in the inverted index
                            except IndexElement.DoesNotExist:
                                w = IndexElement(term=term)
                                w.save()
                            except IndexElement.MultipleObjectsReturned:
                                print "WTF"
                                return
                            w.song.add(s)   # Added a new link from the term to the song
                            added_terms += 1

                    added_songs += 1
                    ids_of_songs.add(int(aid))
                    break

            csvfile2.close()

        csvfile1.close()
        print "Added songs", added_songs
        print "Found words", recognized_tokens
        print "Added terms", added_terms
Example #7
 def test_exctract_words(self):
     txt = u'''Это  отразилось: на количественном,и на качествен_ном
             - росте карельско-финляндского сотрудничества - офигеть! кони+лошади=масло.
             -сказал кто-то --нет--'''
     words = list(extract_words(txt))
     self.assertListEqual(words, [
         u'Это', u'отразилось', u'на', u'количественном', u'и', u'на',
         u'качествен_ном', u'росте', u'карельско-финляндского',
         u'сотрудничества', u'офигеть', u'кони', u'лошади', u'масло',
         u'сказал', u'кто-то', u'нет',
     ])
Example #8
    def forwards(self, orm):
        "Write your forwards methods here."

        csvfile1 = open(FILE_WITH_PESNIFILM, 'rb')
        reader1 = csv.reader(csvfile1, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)

        # Add new records to the database
        added_songs = 0
        added_terms = 0
        recognized_tokens = 0
        for row1 in reader1:
            print added_songs

            # [self.artist, self.title, self.duration, self.url, lyrics, movie]
            # duration is 0 everywhere
            artist, title, duration, url, lyrics, movie = row1[0:6]
            duration = int(duration)

            # Collisions on aid are possible (our range is [-10000 : -19999]) - add to the database
            s = Song(aid=-10000-added_songs, artist=artist, title=title, duration=duration,
                     url=url, lyrics=lyrics, linked_movie=movie)
            s.save()

            # Keep the extra author boilerplate out of the index
            ar = artist.replace("исполнение", "").replace("текст", "").replace("музыка", "").replace("слова", "")
            ar = ar.replace(" и ", " ").replace("автор", "")
            all_text = lyrics + " " + title + " " + movie + " " + ar

            print artist

            # Now pull out all the tokens from the text
            for word in tokenizers.extract_words(all_text.decode('utf8')):
                recognized_tokens += 1
                for term in morph.normalize(word.upper()):
                    # Take every normalization variant, since there may be homonyms
                    try:
                        w = IndexElement.objects.get(term=term)     # The word was already in the inverted index
                    except IndexElement.DoesNotExist:
                        w = IndexElement(term=term)
                        w.save()
                    except IndexElement.MultipleObjectsReturned:
                        print "WTF"
                        return
                    w.song.add(s)   # Added a new link from the term to the song
                    added_terms += 1

            added_songs += 1

        csvfile1.close()
        print "Added songs", added_songs
        print "Found words", recognized_tokens
        print "Added terms", added_terms
Example #9
def classify(self, input):
    words = list()
    for word in tokenizers.extract_words(input):
        words.append(word)
    stemmed = [RussianStemmer().stem(word) for word in words]
    result = dict()
    for _class in self.classes:
        prob = log(float(self.docs_in_class[_class]) / self.docs_number)
        for word in stemmed:
            if word in self.words_freq[_class]:
                wordFreq = self.words_freq[_class][word]
            else:
                wordFreq = 0
            prob += log(float(wordFreq + 1) / float(len(self.unique_words_set) + self.words_in_class[_class]))
        result[_class] = prob
    return result
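The score above is a Laplace-smoothed (add-one) multinomial Naive Bayes log-probability: log P(class) plus, for every stemmed word, log((freq + 1) / (|vocabulary| + words in class)). The same arithmetic as a small self-contained function (the toy numbers in the example call are made up for illustration):

from math import log

def nb_log_score(stemmed_words, word_freq, words_in_class, vocab_size,
                 docs_in_class, docs_number):
    score = log(float(docs_in_class) / docs_number)      # log prior of the class
    for word in stemmed_words:
        freq = word_freq.get(word, 0)
        score += log(float(freq + 1) / (vocab_size + words_in_class))
    return score

# Two-word document scored against a tiny toy class model
print nb_log_score(["cat", "home"], {"cat": 3}, words_in_class=10,
                   vocab_size=50, docs_in_class=4, docs_number=10)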
Example #11
    def classify(self, text, biggest_class):
        pos_count = 0
        neg_count = 0
        for word in tokenizers.extract_words(text):
            stemmed_word = RussianStemmer().stem(word)
            if stemmed_word in self.negative_keywords:
                print "negative word " + stemmed_word + " found in text: " + text
                neg_count += 1
            elif stemmed_word in self.positive_keywords:
                print "positive word " + stemmed_word + " found in text: " + text
                pos_count += 1

        result = dict()
        result['pos'] = pos_count
        result['neg'] = neg_count
        if result['pos'] == result['neg']:
            result[biggest_class] += 1
        return result
Example #12
    def learn(self, class_name):
        dir_name = "."
        file_name = "tweets_by_trend.xml"

        self.classes.add(class_name)
        self.words_freq[class_name] = {}

        if class_name is "negative":
            code = 0
        else:
            code = 1

        print "processing", file_name

        tree = ET.parse(dir_name + "/" + file_name)
        root = tree.getroot()
        for tweet in root.findall('tweet'):
            sent = int(tweet.find('sent').text)
            if sent == code:
                text = tweet.find('text').text
                words = [
                    word.lower() for word in tokenizers.extract_words(text)
                ]
                self.docs_number += 1
                self.unique_words_set = self.unique_words_set | set(words)
                stemmer = RussianStemmer()
                for word in words:
                    stemmed = stemmer.stem(word)
                    if stemmed in self.words_freq[class_name]:
                        self.words_freq[class_name][stemmed] += 1
                    else:
                        self.words_freq[class_name][stemmed] = 1

                # Per-document bookkeeping, outside the word loop so each tweet
                # is counted once
                if class_name in self.words_in_class:
                    self.words_in_class[class_name] += len(words)
                    self.docs_in_class[class_name] += 1
                else:
                    self.words_in_class[class_name] = len(words)
                    self.docs_in_class[class_name] = 1
Example #15
# coding=utf-8

__author__ = 'artemii'
from pymorphy import get_morph
from pymorphy.contrib import tokenizers

f = open('negative_words.txt', 'r')
resultFile = open('negative_words_normalized.txt', 'a')
morph = get_morph('.')

#normalized = morph.normalize('тнрнюооюпюрю'.decode("utf-8"))
#print normalized.pop().lower().encode("utf-8")

for line in f:
    #    word = raw_input()
    words = tokenizers.extract_words(line.decode("utf-8"))
    word = words.next()
    normalized = morph.normalize(word.upper())
    resultFile.write(normalized.pop().lower().encode("utf-8") + '\n')
#    print normalized.pop().lower()

# for word pairs
#for line in f :
##    word = raw_input()
#    words = tokenizers.extract_words(line.decode("utf-8"))
#    normalized_fst = morph.normalize(words.next().upper())
#    normalized_snd = morph.normalize(words.next().upper())
#    resultFile.write(normalized_fst.pop().lower().encode("utf-8") + ' ' + normalized_snd.pop().lower().encode("utf-8") + '\n')
Example #16
File: views.py Project: kyromen/FED
def home(request):
    if request.method == 'GET' and 'search' in request.GET:
        district = ''
        regions = []
        subways = []

        films = []
        genres = []

        subjects = []

        day = datetime.now()

        string = request.GET['search']
        strings = re.split(u'[\.]+|!|\?', string)
        words = []

        for string in strings:
            for word in tokenizers.extract_words(string):
                words.append(word.upper())
        for word in words:
            word = word.upper()
            info = morph.get_graminfo(word)

            if info[0]['class'] == 'С':
                if check_object(District, {'name': word.lower()}):
                    district = word.lower()
                if check_object(Region, {'name__regex': r'(^' + info[0]['norm'].lower() + ')'}) and len(
                        words) > words.index(word) + 1:
                    if search_by_touch(Region, info[0]['norm'], words[words.index(word) + 1]):
                        regions.append(search_by_touch(Region, info[0]['norm'], words[words.index(word) + 1]))
                elif check_object(Region, {'name__regex': r'(^' + info[0]['norm'].lower() + '$)'}):
                    regions.append(word.lower())
                if check_object(Subway, {'name__regex': r'(^' + word.lower() + ' )'}) and len(words) > words.index(
                        word) + 1:
                    query_str = word
                    for word_ in words[words.index(word) + 1:words.index(word) + 2]:
                        if check_object(Subway, {'name__regex': r'(^' + query_str.lower() + '$)'}):
                            subways.append(Subway.objects.get(name=query_str.lower()))
                            break
                        query_str += ' ' + word_
                elif check_object(Subway, {'name__regex': r'(^' + word.lower() + '$)'}):
                    subways.append(Subway.objects.get(name=word.lower()))
                if check_object(Subject, {'name': info[0]['norm'].lower()}):
                    subjects.append(Subject.objects.get(name=info[0]['norm'].lower()))
                if check_object(Genre, {'name': info[0]['norm'].lower()}):
                    genres.append(Genre.objects.get(name=info[0]['norm'].lower()))
            elif info[0]['class'] == 'П':
                if len(words) > words.index(word) + 1:
                    if search_by_touch(Region, word, words[words.index(word) + 1]):
                        regions.append(search_by_touch(Region, word, words[words.index(word) + 1]))
                    if check_object(Subway, {'name__regex': r'(^' + word.lower() + ' )'}) and len(
                            words) > words.index(word) + 1:
                        query_str = word
                        for word_ in words[words.index(word) + 1:words.index(word) + 3]:
                            if check_object(Subway, {'name__regex': r'(^' + query_str.lower() + '$)'}):
                                subways.append(Subway.objects.get(name=query_str.lower()))
                                break
                            query_str += ' ' + word_
                elif check_object(Region, {'name__regex': r'(^' + info[0]['norm'].lower() + '$)'}):
                    regions.append(word.lower())
                elif check_object(Subway, {'name__regex': r'(^' + word.lower() + '$)'}):
                    subways.append(Subway.objects.get(name=word.lower()))

        events = []
        cinemas = []

        for film in Film.objects.filter(genres__in=genres):
            films.append(film.name)

        if district != '':
            district = District.objects.get(name=district)
            for region in district.region.all():
                regions.append(region.name)
        for name in regions:
            region = Region.objects.get(name=name)
            for subway in region.subways.all():
                subways.append(subway)

        for subway in subways:
            filter_parameters = {'place__geo__x__gt': subway.geo.x - 0.005,
                                 'place__geo__x__lt': subway.geo.x + 0.005,
                                 'place__geo__y__gt': subway.geo.y - 0.005,
                                 'place__geo__y__lt': subway.geo.y + 0.005}

            output = Event.objects.filter(**filter_parameters)
            for event in output:
                events.append(event)

            if films:
                filter_parameters['films__film__name__in'] = films

            output = Cinema.objects.filter(**filter_parameters)
            for cinema in output:
                if cinema not in cinemas:
                    cinemas.append(cinema)

        if len(cinemas) == 0:
            output = Cinema.objects.filter(films__film__name__in=films)
            for cinema in output:
                if cinema not in cinemas:
                    cinemas.append(cinema)

        if events and subjects:
            output = events
            events = []
            for i in range(len(output) - 1, -1, -1):
                for subject in output[i].subjects.all():
                    if subject in subjects:
                        events.append(output[i])
                        break
        elif subjects:
            for event in Event.objects.filter(subjects__in=subjects):
                events.append(event)

        places = []
        for i in range(len(events) - 1, -1, -1):
            if events[i].place not in places:
                places.append(events[i].place)
            else:
                events.pop(i)

        t = loader.get_template('home.html')
        c = RequestContext(request, {'events': events, 'cinemas': cinemas})
        return HttpResponse(t.render(c))

    t = loader.get_template('home.html')
    c = RequestContext(request, {'objects': {}})
    return HttpResponse(t.render(c))
Example #17
initialDict()

morph = get_morph('/home/bliq/PycharmProjects/Dictionary')  # Load the dictionaries

morth2 = pymorphy2.MorphAnalyzer()

searchWord = ""
synonym = ""
result = ""

text = raw_input("Введите текст:\n")
uni = unicode(text, "UTF-8")  # Convert the string to Unicode

# Split the source string into substrings
listTokens = tokenizers.extract_tokens(uni)
listWords = tokenizers.extract_words(uni)

dic = {}

for word in listWords:
    info = morph.normalize(word.upper())
    info = list(info)[0]

    dic[info] = dic[info] + 1 if dic.has_key(info) else 1

dic = sorted(dic.items(), key = lambda elem: elem[1], reverse = True)

for word in dic:
    info = morph.get_graminfo(word[0])

    # if info[0]['class'] != u'СОЮЗ' \
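The dict counting and sorted() call above can also be written with the standard library's collections.Counter, which yields the same descending-by-count order via most_common(); a sketch under the same assumption that morph.normalize returns a set of normal forms:

from collections import Counter

def count_normal_forms(words, normalize):
    # words: iterable of tokens; normalize: e.g. morph.normalize
    counts = Counter()
    for word in words:
        counts[list(normalize(word.upper()))[0]] += 1
    return counts.most_common()   # [(normal_form, count), ...], most frequent first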
Example #18
def to_seq(input):
    words = list()
    for word in tokenizers.extract_words(input):
        words.append(word)
    stemmed = [RussianStemmer().stem(word) for word in words]
    return " ".join(stemmed)
Example #19
def tokenizeText(text):
    return [word for word in tokenizers.extract_words(text)]
Example #20
def song_list(request):    
    start_time = time.time()
    query = request.GET.get('query','')
    if query:

        status, res = search(query.replace("L", "AND").replace(" X", " AND").replace("X", "").encode('utf-8'))
        if status == OK:
            song_list = Song.objects.filter(id__in=res)

            list_of_links = list()
            l_ew = list()

            ended_row_with_dependencies = False
            for i in extract_words(query.encode('utf-8').decode('utf-8').lstrip().rstrip()):
                if i not in ["AND", "OR", "NOT"] and "X" not in i and "L" not in i:
                    l_ew.append(i)
                elif i == "NOT":
                    ended_row_with_dependencies = True
                elif i == "AND" and not ended_row_with_dependencies:
                    list_of_links.append(0)
                elif "X" in i and not ended_row_with_dependencies:
                    list_of_links.append(len(i))
                elif "L" in i and not ended_row_with_dependencies:
                    list_of_links.append(100500)

            list_of_normalized_query_words = [0]*len(l_ew)
            for i, ew in enumerate(l_ew):
                list_of_normalized_query_words[i] = IndexElement.objects.filter(term__in=morph.normalize(ew.upper())).select_related("synonyms", "song")
                if list_of_normalized_query_words[i] is None:
                    print "AAAAAAA"

            # Range here
            for s in song_list:
                l_of_clear_repeats = [0]*len(l_ew)
                l_of_normalized_repeats = [0]*len(l_ew)
                l_of_synonym_repeats = [0]*len(l_ew)
                l_of_tf_idfs = [0]*len(l_ew)
                l_of_positions = [set() for _ in l_ew]    # independent sets per query word ([set()]*n would share one set)
                set_of_highlights = set()                                                # TODO highlight it

                # clear repeats
                low = list(extract_words(s.lyrics))
                i = 0
                for i, w in enumerate(low):
                    w = w.upper()
                    for ii, ew in enumerate(l_ew):
                        if ew.upper() == w:
                            l_of_clear_repeats[ii] += 1
                            set_of_highlights.add(i)
                            l_of_positions[ii].add(i)
                amount_of_words_in_song = i + 1

                for ii, ew in enumerate(l_ew):
                    #   ((^|\|)[^|]*tr($|\|))
                    ss = list()
                    syn_list = list()
                    freq = 0

                    for nor in list_of_normalized_query_words[ii]:
                        ss.append(nor.term)
                        freq += nor.get_linked_songs_amount()
                        for syn in nor.synonyms.all():
                            syn_list.append(syn.term)

                    idf_of_ew = log((100000. - freq + 0.5)/(freq + 0.5))

                    # normalized repeats
                    pat = re.compile("((^|\|)[^\|]*(" + "|".join(ss) + ")($|\|))")
                    print "NORM", "|".join(ss)
                    includes = re.findall(pat, s.map_of_normalized_words)

                    for el in includes:
                        seg, pos, word = el[0].strip("|").split(" ")
                        if seg == '1':
                            set_of_highlights.add(int(pos))
                            l_of_positions[ii].add(int(pos))
                        l_of_normalized_repeats[ii] += 1

                    # Synonyms repeats
                    pat = re.compile("((^|\|)[^\|]*(" + "|".join(syn_list) + ")($|\|))")
                    print "SL", "|".join(syn_list)
                    includes = re.findall(pat, s.map_of_normalized_words)

                    for el in includes:
                        seg, pos, word = el[0].strip("|").split(" ")
                        if seg == '1':
                            set_of_highlights.add(int(pos))
                            l_of_positions[ii].add(int(pos))
                        l_of_synonym_repeats[ii] += 1

                    weighted_tf = (0.6)*l_of_clear_repeats[ii] + (0.3)*l_of_normalized_repeats[ii] + (0.1)*l_of_synonym_repeats[ii]
                    l_of_tf_idfs[ii] = (weighted_tf / amount_of_words_in_song) * idf_of_ew

                range_by = sum(l_of_tf_idfs)                                                            # TODO

                # additional ranking

                all_pos = list()
                for brakes in itertools.product(*l_of_positions):
                    flag = False
                    for i in xrange(len(brakes)-1):
                        if brakes[i] >= brakes[i+1]:
                            flag = True
                            break
                    if not flag:
                        all_pos.append(brakes)

                all_pos2 = list()
                # neighbors through AND
                for b in all_pos:
                    ff = True
                    for i in xrange(len(l_ew)-1):
                        if list_of_links[i] == 0:
                            if b[i+1] != b[i]+1:
                                ff = False
                    if ff:
                        all_pos2.append(b)

                print "AND done", len(all_pos2)

                all_pos3 = list()
                # neighbors through XXX
                for b in all_pos2:
                    ff = True
                    for i in xrange(len(l_ew)-1):
                        if 0 < list_of_links[i] < 100500:
                            trashline = ""
                            for ww in low[b[i] + 1: b[i+1]]:
                                trashline += ww
                            trashline = trashline.upper()
                            am_of_gl = trashline.count("А".decode('utf-8')) + trashline.count("Е".decode('utf-8')) + \
                                       trashline.count("О".decode('utf-8')) + trashline.count("У".decode('utf-8')) + \
                                       trashline.count("Ы".decode('utf-8')) + trashline.count("Э".decode('utf-8')) + \
                                        trashline.count("Я".decode('utf-8')) + trashline.count("И".decode('utf-8')) + \
                                       trashline.count("Ю".decode('utf-8')) + trashline.count("Ё".decode('utf-8'))
                            if am_of_gl != list_of_links[i]:
                                ff = False
                    if ff:
                        all_pos3.append(b)

                for b in all_pos3:
                    print "XXX done", b

                if len(all_pos3) > 0:
                    range_by += 1.0

                print set_of_highlights, l_of_clear_repeats, l_of_normalized_repeats, l_of_synonym_repeats, range_by


            count = song_list.count()
        else:
            return error(request, res)
    else:
        song_list = []
        count = 0


    params_dict = {
        'song_list': song_list,
        'results_count': count,
        'elapsed_time': round_time(time.time() - start_time)
    }

    return render(request, 'song_list.html', params_dict)
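The per-term relevance score in song_list combines a weighted term frequency (exact, normalized and synonym matches weighted 0.6 / 0.3 / 0.1) with a BM25-style idf over the hard-coded collection size of 100000. The same arithmetic as a small standalone function:

from math import log

def term_score(clear, normalized, synonym, words_in_song, songs_with_term,
               collection_size=100000.0):
    # weighted tf mirrors the 0.6 / 0.3 / 0.1 mix above; idf matches
    # log((N - df + 0.5) / (df + 0.5)) used for idf_of_ew
    weighted_tf = 0.6 * clear + 0.3 * normalized + 0.1 * synonym
    idf = log((collection_size - songs_with_term + 0.5) / (songs_with_term + 0.5))
    return (weighted_tf / words_in_song) * idf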
Example #21
    def eval_features(self, number, clazz, text):
        features = {}
        words_count = 0
        average_length = 0
        latin_words = 0
        words = list()

        if clazz in self.classes_count:
            self.classes_count[clazz] += 1
        else:
            self.classes_count[clazz] = 0

        for word in tokenizers.extract_words(text):
            words.append(word)
            words_count += 1
            average_length += len(word)
            if check_latin_word(word):
                latin_words += 1

        sentences = parser.analyze_paragraph(text)

        features['sentences_count'] = len(sentences)
        average_sentence_len = 0
        for sentence in sentences:
            average_sentence_len += len(sentence)

        features['average_sentence_length'] = average_sentence_len / len(sentences)

        excl_marks = "!" in text

        features['words_count'] = words_count
        features['average_length'] = average_length / words_count
        features['class'] = clazz
        features['latin_words'] = latin_words
        features['excl_marks'] = excl_marks
        features['quotes_count'] = len(self.quotes_pattern.findall(text))
        features['digits_count'] = len(self.digit_patter.findall(text))

        # dict features
        features['auto_dict'] = self.count_dict_features("auto.txt", words)
        features['economics_dict'] = self.count_dict_features("economics.txt", words)
        features['hi_tech_dict'] = self.count_dict_features("hi_tech.txt", words)
        features['internet_dict'] = self.count_dict_features("internet.txt", words)
        features['kultura_dict'] = self.count_dict_features("kultura.txt", words)
        features['politics_dict'] = self.count_dict_features("politics.txt", words)
        features['science_dict'] = self.count_dict_features("science.txt", words)
        features['social_dict'] = self.count_dict_features("social.txt", words)
        features['sport_dict'] = self.count_dict_features("sport.txt", words)

        abbrevations_count = 0
        for token in text.split():
            if self.abbreviation_pattern.match(token):
                abbrevations_count += 1
                # print token
                # else:
                #     print "not match: ", token

        # features['abbrevation_count'] = abbrevations_count
        # if abbrevations_count>30:
        #     print number, " ", abbrevations_count
        # print features
        self.files_features[number] = features