Example #1
import re

from Stemmer import Stemmer


def text_cleaner(text):
    text = text.lower()  # convert to lowercase

    text = re.sub(r'https?://[\S]+', ' url ', text)  # replace internet links
    text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', text)

    text = re.sub(r'\d+[-/\.]\d+[-/\.]\d+', ' date ',
                  text)  # replace dates and times
    text = re.sub(r'\d+ ?гг?', ' date ', text)
    text = re.sub(r'\d+:\d+(:\d+)?', ' time ', text)

    # text = re.sub( r'@\w+', ' tname ', text ) # replace Twitter usernames
    # text = re.sub( r'#\w+', ' htag ', text ) # replace hashtags

    text = re.sub(r'<[^>]*>', ' ', text)  # remove HTML tags
    text = re.sub(r'[\W]+', ' ', text)  # remove extraneous characters

    stemmer = Stemmer('russian')
    text = ' '.join(stemmer.stemWords(text.split()))

    stw = [
        'в', 'по', 'на', 'из', 'и', 'или', 'не', 'но', 'за', 'над', 'под',
        'то', 'a', 'at', 'on', 'of', 'and', 'or', 'in', 'for'
    ]
    remove = r'\b(' + '|'.join(stw) + r')\b'  # raw string so \b stays a word boundary
    text = re.sub(remove, ' ', text)

    text = re.sub(r'\b\w\b', ' ', text)  # remove free-standing single letters

    text = re.sub(r'\b\d+\b', ' digit ', text)  # replace numbers

    return text
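A quick usage sketch; the sample input is invented, and since exact stems depend on the Snowball Russian rules, only the placeholder tokens are called out:

sample = 'Смотрите https://example.com 01.02.2020 в 10:30, цена 500'
print(text_cleaner(sample))
# the link, date, time and number come back as the tokens
# ' url ', ' date ', ' time ' and ' digit '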
Example #2
def text_cleaner(text: str):
    text = text.lower()
    stemmer = Stemmer("russian")  # choose the language of the input data
    text = " ".join(stemmer.stemWords(text.split()))
    text = re.sub(r"\b\d+\b", "digit",
                  text)  # replace standalone numbers with the token "digit"
    return text
Example #3
from math import log

from nltk import regexp_tokenize
from Stemmer import Stemmer


def classif(text, mass, num_all_docs, num_words_unic):
    """Score each category with naive Bayes and print the winner.

    The score is the log prior plus add-one-smoothed log likelihoods,
    all taken in base 1.1.
    """
    stm = Stemmer('russian')
    text = stm.stemWords(regexp_tokenize(text.lower(), r"(?x) \w+ | \w+(-\w+)*"))
    summa = 0
    for category in mass:
        # log prior of the category
        summand_1 = log(category.num_docs / num_all_docs, 1.1)
        for word in text:
            # word frequency in the category's training text, add-one smoothed
            num_povt_words = category.lst_allword.count(word)
            summand_2 = log((num_povt_words + 1) /
                            (num_words_unic + category.num_words), 1.1)
            summa = summa + summand_2
        category.c = summand_1 + summa
        summa = 0

    # pick the category with the highest score
    max_c = float('-inf')  # a fixed floor like -100000 can be exceeded by long documents
    number_max = 0
    for number, category in enumerate(mass):
        print(category.c)
        if category.c > max_c:
            max_c = category.c
            number_max = number
    print(mass[number_max].name_categories)
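For reference, the score computed for each category c above is the standard smoothed naive Bayes log-score; the base-1.1 logarithm only rescales it and does not change which category wins:

    score(c) = log P(c) + sum over words w of log( (count(w, c) + 1) / (V + N_c) )

where count(w, c) is the word's frequency in category c's training text, N_c is the category's total word count (num_words), and V is the overall unique-word count (num_words_unic).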
Example #4
class MyStemmer(object):

    def __init__(self, stemmer_type):
        self.stemmer = Stemmer(stemmer_type)

    def do_stemming(self, word_list):
        return self.stemmer.stemWords(word_list)
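A minimal usage sketch (assumes PyStemmer is available as from Stemmer import Stemmer):

stemmer = MyStemmer('english')
print(stemmer.do_stemming(['running', 'jumps']))  # -> ['run', 'jump']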
Example #5
class StemProvider(Provider):
    """Stem the input values (either a single word or a list of words)

    Uses the Porter stemmer algorithm.
    """
    def __init__(self, language='english', **kwargs):
        """
        See here for a full list of languages:

            http://nltk.org/_modules/nltk/stem/snowball.html

        .. note::

            This does not depend on nltk, it depends on the ``pystemmer`` package.

        :param language: language to use during stemming, defaults to english.
        """
        Provider.__init__(self, **kwargs)
        self._stemmer = Stemmer(language)

    def do_process(self, input_value):
        if isinstance(input_value, str):
            return self._stemmer.stemWord(input_value)
        else:
            return self._stemmer.stemWords(input_value)
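A hedged usage sketch; it assumes the Provider base class, which is not shown in this listing, can be constructed without extra arguments:

provider = StemProvider()
print(provider.do_process('running'))             # single word -> 'run'
print(provider.do_process(['running', 'jumps']))  # list of words -> ['run', 'jump']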
Example #6
 def _prepare_text(self, text):
     """Extracts and stems the words from some given text.
     """
     words = re.findall(r"[a-z0-9']+", text.lower())
     words = [word for word in words if word not in STOP_WORDS]
     stemmer = Stemmer('english')
     stemmed_words = stemmer.stemWords(words)
     return stemmed_words
Example #7
 def _prepare_text(self, text):
     """Extracts and stems the words from some given text.
     """
     words = re.findall(r"[a-z0-9']+", text.lower())
     words = [word for word in words if word not in STOP_WORDS]
     stemmer = Stemmer('english')
     stemmed_words = stemmer.stemWords(words)
     return stemmed_words
Example #8
import re

from nltk import regexp_tokenize
from Stemmer import Stemmer

# unpack_line and Categories are project helpers that are not shown in this listing.


def train(name_file_dbase, way_to_dbase):
    stm = Stemmer('russian')
    file_base = open(name_file_dbase, 'r', encoding='utf-8')
    lines = file_base.readlines()
    num_all_docs = len(lines) + 1

    mass = []
    iter1 = 0
    iter2 = 0

    # Parse the index file: a line numbered "1" opens a new category.
    for line in lines:
        number1, address1 = unpack_line(line)
        number = number1.strip("\n")
        address = address1.strip("\n")
        if number == "1":
            mass.append(Categories())
            mass[iter1].name_categories = address  # stripped name, so the paths below resolve
            mass[iter1 - 1].num_docs = iter2  # close off the previous category
            iter1 = iter1 + 1
            iter2 = 0
        iter2 = iter2 + 1
    mass[len(mass) - 1].num_docs = iter2
    while_iter = 0

    file_base.close()
    number = 1

    # Concatenate the text of every training document in each category.
    while while_iter < len(mass):
        while number <= mass[while_iter].num_docs:
            file_forclass = open(way_to_dbase + mass[while_iter].name_categories
                                 + '/' + str(number) + 'forclass.txt', 'r',
                                 encoding='utf-8')
            str_read = re.sub(r"^\s+|\n|\r|\s+$", ' ', file_forclass.read())
            mass[while_iter].line_allword = mass[while_iter].line_allword + str_read
            file_forclass.close()
            number = number + 1
        while_iter = while_iter + 1
        number = 1

    while_iter = 0

    # Tokenize and stem each category's text; record total and unique word counts.
    while while_iter < len(mass):
        forstemmer = mass[while_iter].line_allword
        str_read = stm.stemWords(regexp_tokenize(forstemmer.lower(), r"(?x) \w+ | \w+(-\w+)*"))
        mass[while_iter].num_words = len(str_read)
        mass[while_iter].lst_allword = str_read
        lst_unic_words = list(set(mass[while_iter].lst_allword))
        mass[while_iter].num_wordsunic = len(lst_unic_words)
        while_iter = while_iter + 1

    all_words = 0
    num_words_unic = 0
    while_iter = 0

    # Sum the per-category counts for the classifier.
    while while_iter < len(mass):
        all_words = all_words + mass[while_iter].num_words
        num_words_unic = num_words_unic + mass[while_iter].num_wordsunic
        while_iter = while_iter + 1
    return mass, num_all_docs, num_words_unic
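A sketch of how train and classif (Example #3) fit together; the index-file name and database layout below are hypothetical, and unpack_line and Categories are project-specific:

mass, num_all_docs, num_words_unic = train('dbase.txt', 'db/')
classif('текст нового документа', mass, num_all_docs, num_words_unic)
# prints each category's score, then the best category's name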
Example #9
    def get_search_phrases(self, indexing_func=None):
        """Returns search phrases from properties in a given Model instance.

        Args (optional):
            only_index: List of strings.  Restricts indexing to these property names.
            indexing_func: A function that returns a set of keywords or phrases.

        Note that the indexing_func can be passed in to allow more customized
        search phrase generation.

        Two model variables influence the output of this method:
            INDEX_ONLY: If None, all indexable properties are indexed.
                If a list of property names, only those properties are indexed.
            INDEX_MULTI_WORD: Class variable that allows multi-word search
                phrases like "statue of liberty."
            INDEX_STEMMING: Returns stemmed phrases.
        """
        if not indexing_func:
            klass = self.__class__
            if klass.INDEX_MULTI_WORD:
                indexing_func = klass.get_search_phraseset
            else:
                indexing_func = klass.get_simple_search_phraseset
        if self.INDEX_STEMMING:
            stemmer = Stemmer('english')
        phrases = set()

        # allow indexing of 'subentities' such as tasks of a list as well
        queries = [(self,self.INDEX_ONLY)] + self.INDEX_SUBENTITY_QUERIES
        import logging
        for query, props in queries:
            entities = []
            try:
                subentities = query(self).fetch(1000)
                # get all of them
                while len(subentities) > 0:
                    entities.extend(subentities)
                    last_key = subentities[-1].key()
                    subentities = query(self).order('__key__').filter('__key__ >',last_key).fetch(1000)
            except TypeError, e: # query is not callable because it's an actual entity
                entities = [query]
            for entity in entities:
                for prop_name, prop_value in entity.properties().iteritems():
                    if not props or prop_name in props:
                        values = prop_value.get_value_for_datastore(entity)
                        if not isinstance(values, list):
                            values = [values]
                        if (isinstance(values[0], basestring) and
                                not isinstance(values[0], datastore_types.Blob)):
                            for value in values:
                                words = indexing_func(value,add_stop_words=self.INDEX_ADD_STOP_WORDS)
                                if self.INDEX_STEMMING:
                                    stemmed_words = set(stemmer.stemWords(words))
                                    phrases.update(stemmed_words)
                                else:
                                    phrases.update(words)
Example #10
def process_text(s):
    s = re.sub('<[^>]+>', '', s)  # strip HTML tags
    s = re.sub('&.*?;', '', s)  # strip HTML entities
    words = simple_preprocess(s, deacc=True, max_len=99)  # gensim tokenizer
    words = [word for word in words if word not in stoplist]
    stemmer = Stemmer('english')
    words = stemmer.stemWords(words)
    return words
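A usage sketch; it assumes from gensim.utils import simple_preprocess, from Stemmer import Stemmer, import re, and a stoplist defined elsewhere (the stoplist below is invented):

stoplist = {'the', 'and'}
print(process_text('<p>Running &amp; jumping the fences</p>'))
# -> something like ['run', 'jump', 'fenc']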
Example #11
class PyStemmerMixIn(AbstractLanguage, metaclass=abc.ABCMeta):
    """Language which is supported by "PyStemmer" Python module."""
    def __init__(self):
        """Constructor."""
        super().__init__()

        # PyStemmer instance (lazy initialized)
        self.__pystemmer = None

    def stem_words(self, words: List[str]) -> List[str]:
        """Stem list of words with PyStemmer."""
        language_code = self.language_code()
        words = decode_object_from_bytes_if_needed(words)

        # Normalize apostrophe so that "it’s" and "it's" get treated identically (it's being done in
        # _tokenize_with_spaces() too but let's not assume that all tokens that are to be stemmed go through sentence
        # tokenization first)
        words = [word.replace("’", "'") for word in words]

        if language_code is None:
            raise McLanguageException("Language code is None.")

        if words is None:
            raise McLanguageException("Words to stem is None.")

        # (Re-)initialize stemmer if needed
        if self.__pystemmer is None:

            try:
                self.__pystemmer = PyStemmer(language_code)
            except Exception as ex:
                raise McLanguageException(
                    "Unable to initialize PyStemmer for language '%s': %s" % (
                        language_code,
                        str(ex),
                    ))

        stems = self.__pystemmer.stemWords(words)

        if len(words) != len(stems):
            log.warning(
                "Stem count is not the same as word count; words: %s; stems: %s"
                % (
                    str(words),
                    str(stems),
                ))

        # Perl's Snowball implementation used to return lowercase stems
        stems = [stem.lower() for stem in stems]

        return stems
Example #12
def stem_words(iterable, language='english'):
    """Stem every word in iterable.

    Uses PyStemmer, which is based on the Porter stemming
    algorithm, an algorithm for suffix stripping.

    https://tartarus.org/martin/PorterStemmer/def.txt

    :rtype: list.
    """
    try:
        stemmer = Stemmer(language)
    except KeyError:
        # PyStemmer raises KeyError for an unknown language name
        stemmer = Stemmer('english')
    return stemmer.stemWords(iterable)
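Usage sketch; the fallback branch fires because PyStemmer raises KeyError for an unknown algorithm name:

print(stem_words(['running', 'easily']))            # -> ['run', 'easili']
print(stem_words(['running'], language='klingon'))  # unknown language, falls back to English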
Example #13
class PyStemmerMixIn(AbstractLanguage, metaclass=abc.ABCMeta):
    """Language which is supported by "PyStemmer" Python module."""

    def __init__(self):
        """Constructor."""
        super().__init__()

        # PyStemmer instance (lazy initialized)
        self.__pystemmer = None

    def stem_words(self, words: List[str]) -> List[str]:
        """Stem list of words with PyStemmer."""
        language_code = self.language_code()
        words = decode_object_from_bytes_if_needed(words)

        # Normalize apostrophe so that "it’s" and "it's" get treated identically (it's being done in
        # _tokenize_with_spaces() too but let's not assume that all tokens that are to be stemmed go through sentence
        # tokenization first)
        words = [word.replace("’", "'") for word in words]

        if language_code is None:
            raise McLanguageException("Language code is None.")

        if words is None:
            raise McLanguageException("Words to stem is None.")

        # (Re-)initialize stemmer if needed
        if self.__pystemmer is None:

            try:
                self.__pystemmer = PyStemmer(language_code)
            except Exception as ex:
                raise McLanguageException(
                    "Unable to initialize PyStemmer for language '%s': %s" % (language_code, str(ex),)
                )

        stems = self.__pystemmer.stemWords(words)

        if len(words) != len(stems):
            log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

        # Perl's Snowball implementation used to return lowercase stems
        stems = [stem.lower() for stem in stems]

        return stems
Example #14
    def __text_cleaner_with_stemming(raw_text):
        """
        Cleans up the text with regexps, then stems it.

        :param raw_text: source text
        :return: clean text
        """

        raw_text = raw_text.lower()  # convert to lowercase

        raw_text = re.sub(r'https?://[\S]+', ' url ', raw_text)  # replace internet links
        raw_text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', raw_text)

        raw_text = re.sub(r'\d+[-/\.]\d+[-/\.]\d+', ' date ', raw_text)  # replace dates and times
        raw_text = re.sub(r'\d+ ?гг?', ' date ', raw_text)
        raw_text = re.sub(r'\d+:\d+(:\d+)?', ' time ', raw_text)
        raw_text = re.sub(r'@\w+', ' tname ', raw_text)  # replace Twitter usernames
        raw_text = re.sub(r'#\w+', ' htag ', raw_text)  # replace hashtags

        raw_text = re.sub(r'<[^>]*>', ' ', raw_text)  # remove HTML tags
        raw_text = re.sub(r'[\W]+', ' ', raw_text)  # remove extraneous characters

        stemmer = Stemmer('russian')

        raw_text = ' '.join(stemmer.stemWords(raw_text.split()))

        stw = ['в', 'по', 'на', 'из', 'и', 'или', 'не', 'но', 'за', 'над', 'под', 'то',
               'a', 'at', 'on', 'of', 'and', 'or', 'in', 'for']
        remove = r'\b(' + '|'.join(stw) + r')\b'  # raw string so \b stays a word boundary
        raw_text = re.sub(remove, ' ', raw_text)

        raw_text = re.sub(r'\b\w\b', ' ', raw_text)  # remove free-standing single letters

        raw_text = re.sub(r'\b\d+\b', ' digit ', raw_text)  # replace numbers

        return raw_text
Example #15
def text_cleaner(text):
    text = text.lower()  # convert to lowercase
    stemmer = Stemmer('russian')
    text = ' '.join(stemmer.stemWords(text.split()))
    text = re.sub(r'\b\d+\b', ' digit ', text)  # replace numbers
    return text
Example #16
class CleanTextUtil:
    """ Utility for cleaning text by using stop words and stemming.
 
    Examples:
    >>> c = CleanTextUtil("french")
    >>> c.stem_words([u"Nous", u"allions", u"à", u"la", u"plage"])
    [u'Nous', u'allion', u'à', u'la', u'plag']
    >>> c.rm_stop_words([u"Nous", u"allions", u"à", u"la", u"plage"])
    [u'Nous', u'allions', u'plage']
    >>> c.clean_text(u"Nous allions à la plage")
    [u'allion', u'plag']

    Attributes:
        stemmer (Stemmer.Stemmer): The stemmer delegate object.
        stopwords (list of str): A list of stopwords.

    """
    def __init__(self, language):
        """ Initializes attributes with the language provided.

        Args:
            language (str): The language used to stem ('french', 'english').

        """
        self.stemmer = Stemmer(language)
        self.stopwords = stopwords.words(language)
    
    def stem_words(self, words):
        """ Stems a list of words.

        Args:
            words (list of str): A list of words.

        Returns:
            list of str: The list with each word stemmed.

        """
        return self.stemmer.stemWords(words)
    
    def rm_stop_words(self, words):
        """ Removes stop words from a list of words.

        Args:
            words (list of str): A list of words.

        Returns:
            list of str: The list minus the stop words.

        """
        return [word for word in words if word.lower() not in self.stopwords]

    def clean_text(self, text):
        """ Cleans a text to optimize search engines. 
        
        Step of the cleaning: 
            1. Transform all characters to lowercase letters.
            2. Find all word with the regular expression "\w+".
            3. Remove stop words with a filter.
            4. Stem the rest of words.

        Args:
            text (str): A text.

        Returns:
            list of str: The list of words transformed.

        """
        words = SPLIT_TEXT.findall(text.lower())
        words = self.rm_stop_words(words)
        words = self.stem_words(words)
        return words
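CleanTextUtil relies on two names that are not part of the listing; a minimal setup sketch (the SPLIT_TEXT pattern is an assumption matching the "\w+" the docstring mentions, and stopwords is NLTK's corpus):

import re

from nltk.corpus import stopwords
from Stemmer import Stemmer

SPLIT_TEXT = re.compile(r'\w+', re.UNICODE)  # hypothetical definition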
Example #17
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os
import sys

from Stemmer import Stemmer
from nltk import regexp_tokenize

directory = sys.argv[1]

files = os.listdir(directory)

text_file = [name for name in files if name.endswith('.txt')]

all_text = ""

for i in text_file:
    try:
        # join with the directory so files outside the cwd open correctly
        f = open(os.path.join(directory, i), 'r', encoding='utf-8')
        all_text = all_text + f.read()
    except OSError:
        print(i)

stm = Stemmer('russian')
text = stm.stemWords(regexp_tokenize(all_text.lower(), r"(?x) \w+ | \w+(-\w+)*"))
for i in text:
    num = text.count(i)  # note: quadratic; prints each word once per occurrence
    print(i, " ", num)
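The counting loop above is quadratic and prints each word once per occurrence; a collections.Counter does the same tally in one pass, sketched here as an alternative:

from collections import Counter

for word, num in Counter(text).most_common():
    print(word, " ", num)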
Example #18
    def full_text_search(phrase,
                         limit=10,
                         kind=None,
                         stemming=INDEX_STEMMING,
                         multi_word_literal=INDEX_MULTI_WORD,
                         add_stop_words=frozenset([])):
        """Queries search indices for phrases using a merge-join.

        Args:
            phrase: String.  Search phrase.
            kind: String.  Returned keys/entities are restricted to this kind.

        Returns:
            A list of (key, title) tuples corresponding to the indexed entities.
            Multi-word literal matches are returned first.

        TODO -- Should provide feedback if input search phrase has stop words, etc.
        """
        index_keys = []
        keywords = unidecode(PUNCTUATION_REGEX.sub(' ',
                                                   phrase)).lower().split()

        if stemming:
            stemmer = Stemmer('english')
            klass = StemmedIndex
        else:
            klass = LiteralIndex

        #logging.warning(keywords)
        current_user = users.get_current_user()
        if len(keywords) > 1 and multi_word_literal:
            # Try to match literal multi-word phrases first
            if len(keywords) == 2:
                search_phrases = [' '.join(keywords)]
            else:
                search_phrases = []
                sub_strings = len(keywords) - 2
                keyword_not_stop_word = map(
                    lambda x: x not in STOP_WORDS and x not in add_stop_words,
                    keywords)
                for pos in xrange(0, sub_strings):
                    if keyword_not_stop_word[pos] and keyword_not_stop_word[
                            pos + 2]:
                        search_phrases.append(' '.join(keywords[pos:pos + 3]))

            for phrase in search_phrases:
                #logging.warning(phrase)
                if stemming:
                    phrase = ' '.join(stemmer.stemWords(phrase.split()))
                if current_user:
                    query = klass.all(keys_only=True).filter(
                        'phrases =',
                        phrase).filter('view_permissions =',
                                       current_user.user_id()).order(
                                           'ordinal')  #.order('-rating')
                pub_query = klass.all(keys_only=True).filter(
                    'phrases =',
                    phrase).filter('view_permissions =', 'public').order(
                        'ordinal')  #.order('-rating')
                if kind:
                    if current_user:
                        query = query.filter('parent_kind =', kind)
                    pub_query = pub_query.filter('parent_kind =', kind)
                if current_user:
                    index_keys.extend([
                        key
                        for key in query.fetch(limit=limit - len(index_keys))
                        if key not in index_keys
                    ])
                index_keys.extend([
                    key
                    for key in pub_query.fetch(limit=limit - len(index_keys))
                    if key not in index_keys
                ])

        if len(index_keys) < limit:
            new_limit = limit - len(index_keys)
            keywords = filter(lambda x: len(x) >= SEARCH_PHRASE_MIN_LENGTH,
                              keywords)
            if stemming:
                keywords = stemmer.stemWords(keywords)
            for keyword in keywords:
                if current_user:
                    query = klass.all(keys_only=True).filter(
                        'phrases =',
                        keyword).filter('view_permissions =',
                                        current_user.user_id()).order(
                                            'ordinal')  #.order('-rating')
                pub_query = klass.all(keys_only=True).filter(
                    'phrases =',
                    keyword).filter('view_permissions =', 'public').order(
                        'ordinal')  #.order('-rating')
                if kind:
                    if current_user:
                        query = query.filter('parent_kind =', kind)
                    pub_query = pub_query.filter('parent_kind =', kind)
                if current_user:
                    index_keys.extend([
                        key
                        for key in query.fetch(limit=limit - len(index_keys))
                        if key not in index_keys
                    ])
                index_keys.extend([
                    key
                    for key in pub_query.fetch(limit=limit - len(index_keys))
                    if key not in index_keys
                ])

        return [(key.parent(), SearchIndex.get_title(key.name()))
                for key in index_keys]
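The multi-word branch above slides a three-word window over the keywords and keeps windows whose first and last words are not stop words. A standalone sketch of just that windowing logic (the STOP_WORDS set here is a stand-in for the module's real one):

STOP_WORDS = {'of', 'the'}

def three_word_phrases(keywords, add_stop_words=frozenset()):
    # mirrors the keyword_not_stop_word test in full_text_search
    ok = [w not in STOP_WORDS and w not in add_stop_words for w in keywords]
    return [' '.join(keywords[pos:pos + 3])
            for pos in range(len(keywords) - 2)
            if ok[pos] and ok[pos + 2]]

print(three_word_phrases(['statue', 'of', 'liberty', 'island']))
# -> ['statue of liberty']  ('of liberty island' is dropped: 'of' is a stop word)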
Example #19
    def get_search_phrases(self, indexing_func=None):
        """Returns search phrases from properties in a given Model instance.

        Args (optional):
            only_index: List of strings.  Restricts indexing to these property names.
            indexing_func: A function that returns a set of keywords or phrases.

        Note that the indexing_func can be passed in to allow more customized
        search phrase generation.

        Two model variables influence the output of this method:
            INDEX_ONLY: If None, all indexable properties are indexed.
                If a list of property names, only those properties are indexed.
            INDEX_MULTI_WORD: Class variable that allows multi-word search
                phrases like "statue of liberty."
            INDEX_STEMMING: Returns stemmed phrases.
        """
        if not indexing_func:
            klass = self.__class__
            if klass.INDEX_MULTI_WORD:
                indexing_func = klass.get_search_phraseset
            else:
                indexing_func = klass.get_simple_search_phraseset
        if self.INDEX_STEMMING:
            stemmer = Stemmer('english')
        phrases = set()

        # allow indexing of 'subentities' such as tasks of a list as well
        queries = [(self, self.INDEX_ONLY)] + self.INDEX_SUBENTITY_QUERIES
        import logging
        for query, props in queries:
            entities = []
            try:
                subentities = query(self).fetch(1000)
                # get all of them
                while len(subentities) > 0:
                    entities.extend(subentities)
                    last_key = subentities[-1].key()
                    subentities = query(self).order('__key__').filter(
                        '__key__ >', last_key).fetch(1000)
            except TypeError, e:  # query is not callable because it's an actual entity
                entities = [query]
            for entity in entities:
                for prop_name, prop_value in entity.properties().iteritems():
                    if not props or prop_name in props:
                        values = prop_value.get_value_for_datastore(entity)
                        if not isinstance(values, list):
                            values = [values]
                        if (isinstance(values[0], basestring)
                                and not isinstance(values[0],
                                                   datastore_types.Blob)):
                            for value in values:
                                words = indexing_func(
                                    value,
                                    add_stop_words=self.INDEX_ADD_STOP_WORDS)
                                if self.INDEX_STEMMING:
                                    stemmed_words = set(
                                        stemmer.stemWords(words))
                                    phrases.update(stemmed_words)
                                else:
                                    phrases.update(words)
Example #20
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os
import sys

from Stemmer import Stemmer
from nltk import regexp_tokenize

directory = sys.argv[1]

files = os.listdir(directory)

text_file = [name for name in files if name.endswith('.txt')]

all_text = ""

for i in text_file:
    try:
        # join with the directory so files outside the cwd open correctly
        f = open(os.path.join(directory, i), 'r', encoding='utf-8')
        all_text = all_text + f.read()
    except OSError:
        print(i)

stm = Stemmer('russian')
text = stm.stemWords(
    regexp_tokenize(all_text.lower(),
                    r"(?x) \w+ | \w+(-\w+)*"))
for i in text:
    num = text.count(i)  # note: quadratic; prints each word once per occurrence
    print(i, " ", num)
Example #21
    def full_text_search(phrase, limit=10,
                         kind=None,
                         stemming=INDEX_STEMMING,
                         multi_word_literal=INDEX_MULTI_WORD,
                         add_stop_words=frozenset([])):
        """Queries search indices for phrases using a merge-join.

        Args:
            phrase: String.  Search phrase.
            kind: String.  Returned keys/entities are restricted to this kind.

        Returns:
            A list of (key, title) tuples corresponding to the indexed entities.
            Multi-word literal matches are returned first.

        TODO -- Should provide feedback if input search phrase has stop words, etc.
        """
        index_keys = []
        keywords = unidecode(PUNCTUATION_REGEX.sub(' ', phrase)).lower().split()

        if stemming:
            stemmer = Stemmer('english')
            klass = StemmedIndex
        else:
            klass = LiteralIndex

        #logging.warning(keywords)
        current_user = users.get_current_user()
        if len(keywords) > 1 and multi_word_literal:
            # Try to match literal multi-word phrases first
            if len(keywords) == 2:
                search_phrases = [' '.join(keywords)]
            else:
                search_phrases = []
                sub_strings = len(keywords) - 2
                keyword_not_stop_word = map(lambda x: x not in STOP_WORDS and x not in add_stop_words, keywords)
                for pos in xrange(0, sub_strings):
                    if keyword_not_stop_word[pos] and keyword_not_stop_word[pos+2]:
                        search_phrases.append(' '.join(keywords[pos:pos+3]))

            for phrase in search_phrases:
                #logging.warning(phrase)
                if stemming:
                    phrase = ' '.join(stemmer.stemWords(phrase.split()))
                if current_user:
                    query = klass.all(keys_only=True).filter('phrases =', phrase).filter('view_permissions =',current_user.user_id()).order('ordinal') #.order('-rating')
                pub_query = klass.all(keys_only=True).filter('phrases =', phrase).filter('view_permissions =','public').order('ordinal') #.order('-rating')
                if kind:
                    if current_user:
                        query = query.filter('parent_kind =', kind)
                    pub_query = pub_query.filter('parent_kind =', kind)
                if current_user:
                    index_keys.extend([key for key in query.fetch(limit=limit-len(index_keys)) if key not in index_keys])
                index_keys.extend([key for key in pub_query.fetch(limit=limit-len(index_keys)) if key not in index_keys])


        if len(index_keys) < limit:
            new_limit = limit - len(index_keys)
            keywords = filter(lambda x: len(x) >= SEARCH_PHRASE_MIN_LENGTH, keywords)
            if stemming:
                keywords = stemmer.stemWords(keywords)
            for keyword in keywords:
                if current_user:
                    query = klass.all(keys_only=True).filter('phrases =', keyword).filter('view_permissions =',current_user.user_id()).order('ordinal') #.order('-rating')
                pub_query = klass.all(keys_only=True).filter('phrases =', keyword).filter('view_permissions =','public').order('ordinal') #.order('-rating')
                if kind:
                    if current_user:
                        query = query.filter('parent_kind =', kind)
                    pub_query = pub_query.filter('parent_kind =', kind)
                if current_user:
                    index_keys.extend([key for key in query.fetch(limit=limit-len(index_keys)) if key not in index_keys])
                index_keys.extend([key for key in pub_query.fetch(limit=limit-len(index_keys)) if key not in index_keys])

        return [(key.parent(), SearchIndex.get_title(key.name())) for key in index_keys]
Example #22
        #getting external links
        extl = find_between(text, "xternal links==", "\n\n")
        text = text.replace(extl, '')
        #getting references
        ref = find_between(text, "eferences==", "==") + find_between(text, "eferences ==", "==")
        text = text.replace(ref, '')

        #clearing up the dictionary, and working on each field
        article_dict = {}

        #TITLE
        field_tokens = []
        title = re.sub('[^A-Za-z]', ' ', title)
        chunk = nltk.word_tokenize(title.lower())
        stopped_tokens = [i for i in chunk if i not in stop_words]
        field_tokens = p_stemmer.stemWords(stopped_tokens)
        for i in field_tokens:
            if i in article_dict:
                freq = int(find_between(article_dict[i], "(",")")) + 1
                if "T" in article_dict[i]:
                    article_dict[i] = find_between(article_dict[i], "", "(") + "(%d)" % freq
                else:
                    article_dict[i] = "T" + find_between(article_dict[i], "", "(") + "(%d)" % freq
            else:
                article_dict[i] = "T%d(1)" % count

        #BODY TEXT
        field_tokens = []
        text = re.sub('[^A-Za-z]', ' ', text)
        chunk = nltk.word_tokenize(text.lower())
        stopped_tokens = [i for i in chunk if i not in stop_words]
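find_between is not defined in this snippet; a minimal sketch of what it plausibly does (the text between the first occurrences of two markers, or an empty string):

def find_between(s, first, last):
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ''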
Example #23
def text_cleaner(text):
    text = text.lower()
    stemmer = Stemmer('russian')
    text = ' '.join(stemmer.stemWords(text.split()))
    text = re.sub(r'\b\d+\b', ' digit ', text)
    return text
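Usage sketch (needs import re and from Stemmer import Stemmer; the exact stems depend on the Snowball Russian rules):

print(text_cleaner('Купил 5 новых книг'))
# the number becomes the token ' digit '; the remaining words are stemmed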