Example #1
def preprocess_document(data):
    # Step 1: strip punctuation
    data = data.lower()
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']',
                   '{', '}', '#', '\\', '/', '@', '\xa0', '\n', '&', '$', '‘', '…', '•', '-']
    for punc in punctuation:
        data = data.replace(punc, '')
        
    # Step 2: tokenize 
    data = list(nltk.word_tokenize(data))
    
    # Step 3: strip stopwords
    stop = set(stopwords.words('english'))
    extra_stopwords = ['ok', 'oh', 'via','bc','gon','na'] # add any additional stopwords we want to use here
    stop.update(extra_stopwords)
    stop.update(list(string.ascii_lowercase)) # remove all single letters
    data = [i for i in data if i not in stop] # remove stopwords
    
    # Step 4: stemming
    stemmer = snowballstemmer.stemmer('english') 
    data = stemmer.stemWords(data)
    
    # Step 5: remove words not in NLTK english corpus
    words = set(nltk.corpus.words.words())
    data = [w for w in data if w in words]

    return data
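A minimal usage sketch for the function above (the imports and the sample sentence are illustrative; the NLTK punkt, stopwords and words corpora are assumed to be downloaded):

import string
import nltk
import snowballstemmer
from nltk.corpus import stopwords

# one-time downloads assumed: nltk.download('punkt'), nltk.download('stopwords'), nltk.download('words')
print(preprocess_document("The children were running quickly through the parks!"))
# prints a list of stemmed tokens, roughly ['children', 'run', 'quick', 'park']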
    def __init__(self, german):
        path = os.path.dirname(os.path.abspath(__file__))
        print(path)
        self.IX = open_dir(path + "/index")
        self.Writer = self.IX.writer()

        if german == True:
            self.Stemmer = snowballstemmer.stemmer('german')
        else:
            self.Stemmer = snowballstemmer.stemmer('french')
    def cut2list(self, string):
        """
        返回list
        :param string:
        :return:
        """

        tokens = []
        if self.replaceP == True:
            sens = split(string,
                         '' if self.type == Analyzer.ANALYZERS.Jieba else ' ')
        else:
            sens = [string]  #[strB2Q(string)]
        for sen in sens:
            if self.type == Analyzer.ANALYZERS.Jieba:
                # use jieba for word segmentation
                words = self.analyzer.cut(sen, cut_all=False)
            elif self.type == Analyzer.ANALYZERS.nltk:
                # tokenize English text
                sen = sen.lower()
                words = self.analyzer.word_tokenize(sen)
                stemmer = snowballstemmer.stemmer('english')
                # the argument selects the stemming language
                words = stemmer.stemWords(words)

            if self.useStopwords == True:
                for word in words:
                    if word not in stopwords and len(word.strip()) > 0:
                        tokens.append(word)
            else:
                tokens += words
        return tokens
    def getHighlightingsVariables(self, article, variable_keywords, variable_pages):
        stemmer = snowballstemmer.stemmer("german")
        #goodchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÄÖÜäöüß'"
        for i in range(0, len(article)):
            for j in range(0, len(article[i])):
                article[i][j] = article[i][j].split(" ")
                for k in range(0, len(article[i][j])):
                    #article[i][j][k]=chrtran(article[i][j][k], goodchars, "")
                    article[i][j][k] = stemmer.stemWord(article[i][j][k])

        for i in range(0, len(variable_keywords)):
            #variable_keywords[i]=chrtran(variable_keywords[i], goodchars, "")
            variable_keywords[i] = stemmer.stemWord(variable_keywords[i])

        highlight = []

        for i in range(0, len(article)):
            highlight_article = []

            for j in range(0, len(article[i])):
                highlight_variables = []
                for k in range(0, len(variable_keywords)):
                    highlight_variables.append(random.random())
                highlight_article.append(highlight_variables)

            highlight.append(highlight_article)

        return highlight
Example #5
def clean(text, stemmer='snowball'):
    """Normalize, split, and clean text

    Parameters:
    -----------
    text : str
        Block of text to clean and prepare.
    stemmer : str, opt
        Stemmer to use: [snowball, five, simple, none]

    Returns:
    --------
    text : list of str
        Cleaned and prepared tokens.
    """

    if stemmer not in ['snowball', 'five', 'simple', 'none']:
        raise ValueError("Stemmer choice not available.")

    text = re.sub("[{}]".format(string.punctuation), " ", text.lower())
    text = text.split()

    if stemmer == 'five':
        text = [five_stemmer(item) for item in text]
    elif stemmer == 'snowball':
        stemmer = snowballstemmer.stemmer('english')
        text = stemmer.stemWords(text)
    elif stemmer == 'simple':
        text = [simple_stem(item) for item in text]
    else:
        pass

    text = [item for item in text if item not in STOP_WORDS]

    return text
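A quick usage sketch for clean (STOP_WORDS, five_stemmer and simple_stem live elsewhere in the module; a stand-in stop list is used here and only the snowball path is exercised):

import re
import string
import snowballstemmer

STOP_WORDS = {'the', 'a', 'of'}  # stand-in for the module's stop-word set

print(clean("The cats are running, quickly!", stemmer='snowball'))
# roughly ['cat', 'are', 'run', 'quick']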
Example #6
def seeker_highlight(text, query, algorithm='english'):
    if not query:
        return mark_safe(seeker_format(text))
    try:
        import snowballstemmer
        stemmer = snowballstemmer.stemmer(algorithm)
        stemWord = stemmer.stemWord
        stemWords = stemmer.stemWords
    except Exception:
        stemWord = lambda word: word
        stemWords = lambda words: words
    phrases = _phrase_re.findall(query)
    keywords_q = [
        w.lower() for w in re.split(r'\W+', _phrase_re.sub('', query)) if w
    ]
    highlight = set(stemWords(keywords_q))
    text = seeker_format(text)
    for phrase in phrases:
        text = re.sub('(' + re.escape(phrase) + ')',
                      r'<em>\1</em>',
                      text,
                      flags=re.I)
    parts = []
    for word in re.split(r'(\W+)', text):
        if stemWord(word.lower()) in highlight:
            parts.append('<em>%s</em>' % word)
        else:
            parts.append(word)
    return mark_safe(''.join(parts))
    def cut(self, string):
        """
        用分词器切词并用空格隔开
        :param string:
        :return: 返回格式是字符串
        """

        article_contents = ''
        sens = ''
        if self.replaceP == True:
            sens = split(string)
        else:
            sens = [string]  #strB2Q(string)
        for sen in sens:
            if self.type == Analyzer.ANALYZERS.Jieba:
                # use jieba for word segmentation
                words = self.analyzer.cut(sen, cut_all=False)
            elif self.type == Analyzer.ANALYZERS.nltk:
                # tokenize English text

                sen = sen.lower()
                words = self.analyzer.word_tokenize(sen)
                stemmer = snowballstemmer.stemmer('english')
                # the argument selects the stemming language
                words = stemmer.stemWords(words)

            if self.useStopwords == True:
                for word in words:
                    if word not in stopwords and len(word.strip()) > 0:
                        article_contents += word + " "
            else:
                article_contents = ' '.join(words)

        return article_contents
Example #8
    def __init__(self, language="es"):
        """
        Init method
        :param language: input language
        """
        self.__stemmer = snowballstemmer.stemmer("spanish")
        Token.set_extension("stem", default="", force=True)
Example #9
def do_work(*args):
    import snowballstemmer
    stemmer = snowballstemmer.stemmer('english')
    print(js.data.textdata)
    txt = js.data.textdata
    newval = stemmer.stemWords(txt.split())
    return newval
Example #10
def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        # Convert only ASCII-letters to lowercase, to match C behavior
        original = ''.join(
            (lower_(c) if 'A' <= c <= 'Z' else c for c in original))
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            result.append(original + " -> " + stemmed)
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()
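Illustrative call for the helper above (file names are placeholders; lower_ is a stand-in for the module's ASCII-lowercase helper); with pretty=0 it writes one stemmed word per line:

import codecs
import snowballstemmer

lower_ = str.lower  # stand-in; the real helper lowercases ASCII letters only

stemming('english', 'voc.txt', 'stemmed.txt', 'utf-8', 0)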
Example #11
def do_semantic_analysis(sentence):
    sentence_probability_of_negative = 1
    sentence_probability_of_positive = 1
    stem = stemmer('turkish')

    stopwords_file = open("cookit.pythonanywhere.com/comments/text_files/stopwords.txt", "r").read()
    stopwords_list = stopwords_file.split("\n")
    words_list = sentence.split(" ")
    for word in words_list:
        word = re.sub(r'[^\w\s]', '', word)
        word = word.lower()
        x = [word]
        word = stem.stemWords(x)[0]
        if word in stopwords_list:
            continue
        else:
            try:
                word_probability_of_negative = ProbabilityOfWords.objects.get(word=word).probabilityOfNegative
            except ProbabilityOfWords.DoesNotExist:
                word_probability_of_negative = 1
            try:
                word_probability_of_positive = ProbabilityOfWords.objects.get(word=word).probabilityOfPositive
            except ProbabilityOfWords.DoesNotExist:
                word_probability_of_positive = 1

            sentence_probability_of_negative *= word_probability_of_negative
            sentence_probability_of_positive *= word_probability_of_positive

    if sentence_probability_of_positive > sentence_probability_of_negative:
        result = "positive"
    elif sentence_probability_of_positive < sentence_probability_of_negative:
        result = "negative"
    else:
        result = "notr"
    return result
def my_separate_samples(read_input_lines, stem_flag):

    input_splitted_list = []
    input_class_list = []

    if stem_flag == '1':
        print('stemmer')
        my_stemmer = sb.stemmer('turkish')

    for curr_line in read_input_lines:
        curr_line2 = curr_line.lower()
        exclude = string.punctuation
        curr_line3 = ''.join(ch for ch in curr_line2 if ch not in exclude)
        curr_line4 = curr_line3.split('\t')
        curr_sample = curr_line4[0].split()
        curr_sample = list(set(curr_sample))
        curr_class = curr_line4[1].replace('\n', '')

        if stem_flag == '1':
            stemmed_curr_sample = []
            for wt in curr_sample:
                if len(wt) > 5:
                    stemmed_curr_sample.append(my_stemmer.stemWord(wt))
                else:
                    stemmed_curr_sample.append(wt)
            curr_sample = stemmed_curr_sample

        input_splitted_list.append(curr_sample)
        input_class_list.append(curr_class)

    return input_splitted_list, input_class_list
Example #13
    def __init__(self) -> None:
        warnings.warn(
            f"{self.__class__.__name__} is deprecated, use "
            "snowballstemmer.stemmer('porter') instead.",
            RemovedInSphinx70Warning,
            stacklevel=2)
        self.stemmer = snowballstemmer.stemmer('porter')
Example #14
def WordTabLemma(fin, fout):
    '''Convert one word per line format to word-tab-lemma per line format.'''
    stemmer = snowballstemmer.stemmer('english')
    with open(fin, 'rt') as fi, open(fout, 'wt') as fo:
        for word in fi:
            word = word.strip()
            fo.write("{}\t{}\n".format(word, stemmer.stemWord(word)))
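A hypothetical invocation (file names are placeholders); each output line is a word and its stem separated by a tab:

import snowballstemmer

WordTabLemma('wordlist.txt', 'lemmas.tsv')
# an input line "running" becomes the output line "running<TAB>run"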
Example #15
    def __name_follows(self, token):
        """split the token based on letters start with

        Args:
            token (str): a word

        Returns:
            list: splited word
        """
        follows = [
            '\u0628',  # ب
            '\u0643',  # ك
            '\u0644',  # ل
            '\u0648',  # و
            '\u062a',  # ت
            '\u0633'
        ]
        stem = stemmer("arabic").stemWord(token)
        for follow in follows:
            if token.startswith(follow) and not stem.startswith(follow):
                token = re.sub(follow,
                               r'\g<0><SPLIT>',
                               token,
                               flags=re.UNICODE)
        return token.split("<SPLIT>")
Example #16
def stemmer(pList):
    stemmer = snowballstemmer.stemmer('spanish')
    stemmedWords = set([])
    for word in pList:
        stemmed = stemmer.stemWord(word)
        stemmedWords.add(stemmed)
    return stemmedWords
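Usage sketch (with the fix above the function returns the stems, and duplicates collapse because a set is used):

import snowballstemmer

print(stemmer(['casas', 'casas', 'corriendo']))
# a set of Spanish stems, one entry per distinct stem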
Example #18
    def init(self, options: Dict) -> None:
        if JIEBA:
            dict_path = options.get('dict')
            if dict_path and os.path.isfile(dict_path):
                jieba.load_userdict(dict_path)

        self.stemmer = snowballstemmer.stemmer('english')
Example #20
    def __init__(self, language=None):
        """Create a new highlighter for the specified language.

        """
        if language:
            self.stem = snowballstemmer.stemmer(language)
        else:
            self.stem = NoStem()
Example #21
def gen_words(text, stemming=stem.stemmer('english')):
    """Create generator.
    :param text: some string
    :param stemming: variant of stemming algorithm
    :return: generator giving stemmed words from text
    """
    for word in stemming.stemWords(re.findall(r"[\w']+", text.lower())):
        yield word
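For example (stem is assumed to be an alias for snowballstemmer, as the default argument suggests):

# these imports must appear before the definition above, since the default argument builds a stemmer
import re
import snowballstemmer as stem

print(list(gen_words("Cats are running")))
# lowercased stems, e.g. ['cat', 'are', 'run']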
Example #22
def turkish(sent):
    # No turkish stemmer in NLTK
    stem = snowballstemmer.stemmer('turkish')
    stop = stopwords.words('turkish')
    tx = word_tokenize(sent)
    mx = stem.stemWords(tx)
    px = [x for x in mx if x not in stop]
    return px
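Usage sketch (NLTK's punkt tokenizer and Turkish stop-word list are assumed to be installed):

import snowballstemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

print(turkish("Kitapları çok seviyorum"))
# stemmed Turkish tokens with stop words removed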
Example #25
    def __init__(self, N=8):
        """
        Create the object
        :param int N: max length of the suffix used in the rules
        """
        self.N = N
        self._rules = [None] * (N + 1)
        self._stemmer = sbs.stemmer('italian')
def stem2(in_vec):
    stemmer = snowballstemmer.stemmer('english')
    out_vec = []
    for x in in_vec:
        to_out = stemmer.stemWord(x)
        if len(to_out) > 2:
            out_vec.append(to_out)
    return out_vec
Example #27
def get_feature_base(sentence):
    stemmer = snowballstemmer.stemmer('english')
    words = sentence.split()
    result = []
    for word in words:
        if is_stopword(word):
            continue
        result.append(stemmer.stemWord(word))
    return ' '.join(result)
Example #28
def get_stemmer(language):
    stemmer_languages = [
        "danish", "dutch", "english", "finnish", "french", "german",
        "hungarian", "italian", "norwegian", "portuguese", "romanian",
        "russian", "spanish", "swedish", "turkish"
    ]
    if language.lower() in stemmer_languages:
        return lambda word: snowballstemmer.stemmer(language).stemWord(word)
    return lambda word: word
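Illustrative calls; a supported language returns a Snowball-backed function, anything else falls back to the identity function:

import snowballstemmer

stem_de = get_stemmer("german")
print(stem_de("laufen"))                  # a German Snowball stem
print(get_stemmer("klingon")("Qapla"))    # unsupported language: word returned unchanged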
Example #29
def stem_and_lower(str_):
    """
    Returns string with unique lowercase words stemmed.
    """
    stemmer = snowballstemmer.stemmer(config.LANGUAGE_FULL)
    str_no_punctuation = str_.translate(REMOVE_PUNCTUATION_MAP)
    str_stemmed = stemmer.stemWords(
        map(lambda x: x.lower(), set(str_no_punctuation.split())))
    return ' '.join(str_stemmed)
def aplicarStemmer(pDictPalabrasArchivos):
    print("aplicando stemming...")
    dictRaices = {}
    stemmer = snowballstemmer.stemmer("spanish")
    for docId, palabras in pDictPalabrasArchivos.items():
        raices = stemmer.stemWords(palabras)
        dictRaices[docId] = raices
    ##    archivo.archivo.crearCSVDict(".\stemming.csv",dictRaices)
    return dictRaices
Example #31
def rootsoftheliturgical(words):
    words=words.lower()
    rootfind = stemmer('turkish')
    trans=str.maketrans('', '', punctuation)
    words = words.translate(trans)
    words=StopWords(words)
   # letters = words.split()
    letters = rootfind.stemWords(words)
    string = ' '.join(letters)
    return string
    def __init__(self, xml):
        self.dest = xml.get("dest")
        if self.dest is None:
            raise ValueError()
        self.verbose = xml.get("verbose")
        if self.verbose is None:
            self.verbose = False
        else:
            self.verbose = True
        self.stemmer = snowballstemmer.stemmer('english')
Example #33
def main():
    stemmer = snowballstemmer.stemmer('english')  # load the stemming module
    for i, line in enumerate(sentence_extraction()):
        if i == 10:
            break
        words = line.strip('\n').split(' ')
        for word in words:
            # stem each word with stemmer.stemWord(word)
            print('{}\t{}'.format(word, stemmer.stemWord(word)))
        print('\n')
Example #34
def snowball_tokenfilter(token):
    """
    Snowball token filter

    uses the Snowball stemming library collection for python:
      https://github.com/shibukawa/snowball_py
    """
    stemmer = snowballstemmer.stemmer("english")
    token["token"] = stemmer.stemWord(token["token"])
    return token
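A small check of the filter (the token dict shape follows the function above):

import snowballstemmer

print(snowball_tokenfilter({"token": "running"}))
# {'token': 'run'}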
Example #35
    def __init__(self, **kwargs):
        self.basic_params = kwargs['basic']
        self.emb_params = kwargs['embeddings']
        self.sim_params = kwargs['similarity']
        self.subj_params = kwargs['subjectivity']
        self.sent_params = kwargs['sentiment']
        self.emo_params = kwargs['emotion']

        self.nlp = spacy.load(self.basic_params['model'])
        self.stemmer = stemmer('greek')
        self.lexicon_ = None
Example #36
def stem2(word):

    stemmer = snowballstemmer.stemmer("turkish")
    stemmed = stemmer.stemWord(word)

    if stemmed == "fatur":
        stemmed = "fatura"
    elif stemmed == "hatt":
        stemmed = "hat"

    return stemmed
def clean_text_stemmed(t):
    """Accepts a document string and returns a stemmed copy.
    """
    t = t.lower()
    # Replace non-alphanumeric characters with spaces
    t = re.sub("[^A-Za-z0-9]", " ", t)
    # Replace all numbers by a single char
    t = re.sub("[0-9]+", "#", t)
    stemmer = snowballstemmer.stemmer('english')
    tfinal = " ".join(stemmer.stemWords(t.split()))
    return tfinal
Example #38
def text_cleaner(text):
    stemmer = snowballstemmer.stemmer('russian')
    text = text.lower()  # convert to lowercase
    text = re.sub(r'https?://[\S]+', ' url ', text)  # replace web links
    text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', text)
    text = re.sub(r'<[^>]*>', ' ', text)  # remove html tags
    text = re.sub(r'[\W\n]+', ' ', text)  # remove leftover non-word characters
    text = re.sub(r'\w*\d\w*', '', text)  # drop tokens containing digits
    text = re.sub(r'\w*[.]\w*', '', text)  # drop tokens containing dots
    text = ' '.join(stemmer.stemWords(text.split()))  # stem the words
    return text
def aplicarStemmerConsulta(pLista):
    #print(pLista)
    print("aplicando stemming...")
    lista = []
    stemmer = snowballstemmer.stemmer('spanish')
    for i in pLista:
        #print(i[0])
        raiz = stemmer.stemWords([i[0]])[0]
        lista.append([raiz,i[1]])
        #print(i[0])
    #print(lista)
    return lista
Example #40
def create_search_terms(string_terms):
  ''' Creates search terms by stemming every word within the parameter passed.
  Returns all search terms in one string separated by space'''
  stemmer = snowballstemmer.stemmer('english')
  terms = stemmer.stemWords(string_terms.split())

  search_term = list()
  for term in terms:
    lower_term = term.lower()
    if not lower_term in _STOP_WORDS:
      search_term.append(lower_term)

  return " ".join(search_term)
    def __init__(self, samples=None, stopwords="english", limit=20, logging=False):
        """
        Create a vocabulary which is a mapping from bucket names to lists of
        synonyms that fall into their bucket. Stopwords is a list of words that
        are ignored for the vocabulary and defaults to a built-in english
        stopword list.
        """
        self.stopwords = stopwords
        self.stemmer = snowballstemmer.stemmer("english")
        self.tokens = re.compile(r"[A-Z]?[a-z]{2,}")
        self.logging = logging
        if samples:
            self._generate_vocabulary(samples, limit)
Example #42
def search_result(request):
    query = request.POST.get('query')
    q_words = query.split()
    stemmed_words = []
    for word in q_words:
        lng = detect(word)
        if lng in LANGUAGES:
            lng = LANGUAGES[lng]
            stemmed_words.append(snowballstemmer.stemmer(lng).stemWord(word))
        else:
            stemmed_words.append(word)

    return render(request, 'searchres/search_result.html', {})
def getPalabras():
    file = "dicc.txt"

    arc = open(file, 'r')
    stemmer = snowballstemmer.stemmer('spanish')

    words = {}
    for i in arc:
        i = i.rstrip()
        i = stemmer.stemWord(i)
        words[i] = "word"

    for i in words.items():
        print(i)

    print(len(words))
Example #44
def get_coursed_and_create_matrix():
    results = [course for course in modulestore().get_courses() if course.scope_ids.block_type == "course"]
    new_matrix = TfidMatrixAllCourses.objects.all().first() or TfidMatrixAllCourses()
    print(new_matrix.matrix.shape[0] != len(results))
    if new_matrix.matrix.shape[0] != len(results):
        all_courses = [re.sub("<[^>]*>", "", CourseDetails.fetch_about_attribute(x.id, "overview")) for x in results]

        MatrixEdxCoursesId.objects.all().delete()
        # create the mapping rows eagerly (a bare map() would be lazy in Python 3)
        for x in results:
            MatrixEdxCoursesId.objects.create(course_key=x.id, course_index=results.index(x))

        stemmer = snowballstemmer.stemmer("english")
        courses_stem = [" ".join(stemmer.stemWords(x.split())) for x in all_courses]

        vect = TfidfVectorizer(stop_words=get_stop_words(), lowercase=True, dtype=np.float32)
        matrix = vect.fit_transform(courses_stem)
        new_matrix.matrix = matrix
        new_matrix.save()
Example #45
    def identify_language(self, text):
        self.lang = lang_mapping[langid.classify(text)[0]]
        if self.debug: print("LANG", self.lang)  #, "stemmer", self.stem

        if self.lang == "greek":
            from stemmers.greek import stem, stopwords
            self.stem = stem
            self.legal_token = partial(self.legal_token, exclude_list=stopwords)
        elif self.lang == "turkish": # unfortunately, the turkish stemmer isn't included in nltk
            import snowballstemmer
            from stemmers.turkish import stopwords
            self.stem = snowballstemmer.stemmer("turkish").stemWord
            self.legal_token = partial(self.legal_token, exclude_list=stopwords)
        else:
            from nltk.stem import SnowballStemmer
            from nltk.corpus import stopwords
            self.stem = SnowballStemmer(self.lang).stem
            self.legal_token = partial(self.legal_token, exclude_list=stopwords.words(self.lang))
Example #46
def checkon(fn, o):
	if not os.path.exists(fn) or os.path.isdir(fn):
		fn = fn + '.json'
	if 'title' not in o.json.keys():
		if verbose:
			print('No title in', o.getKey())
		return 1 # no title
	# check for a different language - to avoid stemming altogether
	if o.tags and ('german' in o.tags or 'french' in o.tags or 'portuguese' in o.tags):
		if 'stemmed' in o.json.keys():
			# if stemmed before marked foreign, remove this info
			del o.json['stemmed']
			F = open(fn, 'w')
			F.write(o.getJSON())
			F.close()
			return 2
		else:
			return 0
	changed = False
	### champion variant: snowballstemmer - runs in ~13.5s for 96027 titles
	stemmer = snowballstemmer.stemmer('english').stemWords
	### disregarded variant: snowballstemmer porter - considered outdated
	# stemmer = snowballstemmer.stemmer('porter').stemWords
	### disregarded variant: stemming - too slow, runs in ~33s for 96027 titles
	# stemmer = lambda xs: [stemming.porter2.stem(x) for x in xs]
	### disregarded variant: nltk - worse on verbs ending with -ze
	# stemmer3 = lambda xs: [SnowballStemmer("english").stem(x) for x in xs]
	### end variants
	stemmed = stemmer(string2words(o.get('title')))
	if '' in stemmed:
		print('“{}” is a title of {} and it has an empty word'.format(o.get('title'), C.red(o.getKey())))
		print(string2words(o.get('title')))
		print(stemmer(string2words(o.get('title'))))
	ALLSTEMS.update(stemmed)
	if o.get('stemmed') != stemmed:
		o.json['stemmed'] = stemmed
		changed = True
	if changed:
		F = open(fn, 'w')
		F.write(o.getJSON())
		F.close()
		return 2
	else:
		return 0
Example #47
def main():
    argv = sys.argv
    if len(argv) < 2:
        usage()
        return
    algorithm = 'english'
    if len(argv) > 2:
        algorithm = argv[1]
        argv = argv[2:]
    else:
        argv = argv[1:]
    stemmer = snowballstemmer.stemmer(algorithm)
    splitter = re.compile(r"[\s\.-]")
    for arg in argv:
        for word in splitter.split(arg):
            if word == '':
                continue
            original = word.lower()
            print(original + " -> " + stemmer.stemWord(original))
def preprocess_features(dataframe):
    # get the count of how many times each product appears, may correlate
    product_counts = pandas.DataFrame(pandas.Series(dataframe.groupby(["product_uid"]).size(), name="product_count"))
    dataframe = pandas.merge(dataframe, product_counts, left_on="product_uid", right_index=True, how="left")

    dataframe = experiment_gensim(dataframe)

    dataframe["search_length"] = dataframe.search_term.str.len()

    dataframe["id_bins"] = pandas.cut(dataframe.id, 20, labels=False)

    # word distribution metrics
    dataframe["title_unigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1), axis=1)
    dataframe["title_bigram_overlap"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2), axis=1)

    dataframe["desc_unigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1), axis=1)
    dataframe["desc_bigram_overlap"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2), axis=1)

    dataframe["brand_unigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(1), axis=1)
    dataframe["brand_bigram_overlap"] = dataframe[["search_term", "brand_name"]].apply(make_ngram_match(2), axis=1)

    # stemmed unigrams
    stemmer = snowballstemmer.stemmer("english")
    dataframe["title_unigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_unigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(1, stemmer=stemmer.stemWord), axis=1)
    dataframe["title_bigram_overlap_stemmed"] = dataframe[["search_term", "product_title"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)
    dataframe["desc_bigram_overlap_stemmed"] = dataframe[["search_term", "product_description"]].apply(make_ngram_match(2, stemmer=stemmer.stemWord), axis=1)


    # edit distance metrics (slow)
    dataframe["title_word_edit_distance"] = dataframe[["search_term", "product_title"]].apply(word_edit_distance, axis=1)
    dataframe["title_char_edit_distance"] = dataframe[["search_term", "product_title"]].apply(char_edit_distance, axis=1)
    # dataframe["desc_word_edit_distance"] = dataframe[["search_term", "product_description"]].apply(word_edit_distance, axis=1)
    # dataframe["desc_char_edit_distance"] = dataframe[["search_term", "product_description"]].apply(char_edit_distance, axis=1)

    dataframe = dataframe.drop(["product_title", "search_term", "id", "product_description", "brand_name"], axis=1)

    print(dataframe.describe())

    return dataframe
Example #49
def textrank(text, hdr):
    sent_tokenizer = PunktSentenceTokenizer()
    sentences = sent_tokenizer.tokenize(text)
    word_tokenizer = RegexpTokenizer(r'\w+')

    # finding out the most possible language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]

    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))
    words = [set(stemmer.stemWord(word) for word in word_tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code
Example #50
def seeker_highlight(text, query, algorithm='english'):
    try:
        import snowballstemmer
        stemmer = snowballstemmer.stemmer(algorithm)
        stemWord = stemmer.stemWord
        stemWords = stemmer.stemWords
    except Exception:
        stemWord = lambda word: word
        stemWords = lambda words: words
    phrases = _phrase_re.findall(query)
    keywords = [w.lower() for w in re.split(r'\W+', _phrase_re.sub('', query)) if w]
    highlight = set(stemWords(keywords))
    text = seeker_format(text)
    for phrase in phrases:
        text = re.sub('(' + re.escape(phrase) + ')', r'<em>\1</em>', text, flags=re.I)
    parts = []
    for word in re.split(r'(\W+)', text):
        if stemWord(word.lower()) in highlight:
            parts.append('<em>%s</em>' % word)
        else:
            parts.append(word)
    return ''.join(parts)
def score_words_in_sentence(l_sentence, best_stops):
    """
    Define a score of relevance for each word in tweet
    We consider only non stop words, stemmed words either for tweet and stop name
    :param l_sentence: tweet split in list of words
    :param best_stops: list of more relevant stops
    :return: 1. list of kept/stemmed words found in tweet
             2. list of scores for each of these words
    """
    sb_stemmer = stemmer('french')
    stemmed_sentence = [sb_stemmer.stemWords([x])[0] for x in l_sentence]
    scores_stops = []
    tag_words = [0 for _ in l_sentence]
    relevant_stops = map(lambda x: unicodedata.normalize('NFD', x[0][0]).encode('ascii', 'ignore'), best_stops)
    rg_stop = 1
    for stop in relevant_stops:
        stop_w_index = []
        stop_lw = re.findall(r"\w+", stop, re.UNICODE)
        for w_stop in stop_lw:
            if not w_stop in fr_stop_words:
                stemmed_w = sb_stemmer.stemWords([w_stop])[0]
                if stemmed_w in stemmed_sentence:
                    stop_w_index.append(stemmed_sentence.index(stemmed_w))
        score_w = 0
        for i in range(len(stop_w_index)):
            if (i > 0) and (stop_w_index[i] <= stop_w_index[i-1]):
                score_w = 0
                break
            else:
                score_w += 1
        scores_stops.append(score_w)
        if score_w != 0:
            for idx in stop_w_index:
                if tag_words[idx] == 0:
                    tag_words[idx] = rg_stop
        rg_stop += 1
    return scores_stops, tag_words
Example #52
def get_frequencies(word_dict, date, max_date):
    stemmer = snowballstemmer.stemmer('dutch')    
    freqs = defaultdict(lambda: defaultdict(int))
    freqs_per_day = defaultdict(lambda: defaultdict(int))
    oneday = datetime.timedelta(1)

    # print("min", date, "\nmax", max_date)
    current_year = date.year
    print("current year", current_year)
    
    while date <= max_date:
        # print("current date", date)
        has_file = True
        if current_year != date.year:
            current_year = date.year
            print("current year", current_year)
        try:
            f_in = open(input_path+r"\words"+str(date)+".txt", "r")
        except IOError:
            # print("File not found\n", input_path+r"\words"+str(date)+".txt")
            has_file = False
        if has_file:
            for line in f_in:
                line = line.replace("\n", "")
                line = line.split(";")
                word = line[0]
                freq = int(line[1])                
                if use_stemmer:
                    word = stemmer.stemWord(word)
                if word_dict[word][0]:
                    if log_correlation:
                        freqs[word][date] = freq
                    freqs_per_day[date][word] = freq                      
            f_in.close()
        date += oneday
                            
    return freqs, freqs_per_day
Example #53
def stemming(lang, input, output, encoding, pretty):
    result = []
    stemmer = snowballstemmer.stemmer(lang)
    for original in codecs.open(input, "r", encoding).readlines():
        original = original.strip()
        stemmed = stemmer.stemWord(original)
        if result:
            result.append('\n')
        if pretty == 0:
            if stemmed != "":
                result.append(stemmed)
        elif pretty == 1:
            result.append(original + " -> " + stemmed)
        elif pretty == 2:
            result.append(original)
            if len(original) < 30:
                result.append(" " * (30 - len(original)))
            else:
                result.append("\n")
                result.append(" " * 30)
            result.append(stemmed)
    outfile = codecs.open(output, "w", encoding)
    outfile.write(''.join(result) + '\n')
    outfile.close()
Example #54
    def init(self, options):
        self.stemmer = snowballstemmer.stemmer('russian')
    def lemmatizer(self, word):
        stemmer = snowballstemmer.stemmer('spanish')
        return stemmer.stemWord(word)
Example #56
    def init(self, options):
        # type: (Any) -> None
        self.stemmer = snowballstemmer.stemmer('italian')
#!/usr/bin/python
# -*- coding: iso-8859-9 -*-

import argparse, os, re, sys, operator, math
import snowballstemmer

FILE_ENCODING = "windows-1254"
SMOOTHING_CONST = 0.1
STEMMING = True

# Create stemmer
stemmer = snowballstemmer.stemmer("turkish")

# prior_prob is a dictionary that contains the prior probability of each author
prior_prob = dict()

# word_prob is a dictionary that maps each author to a dictionary of word probabilities
word_prob = dict()

# Dictionary that contains the number of words in all training data of each author
total_words = dict()

# total_docs is the number of documents in training set
total_docs = 0

# authors is the list that contains all author names
authors = []

# unknown word probabilities for each author
unknown_prob = dict()
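A hedged sketch of how these structures would typically combine to score a test document under the Naive Bayes assumption (the actual scoring code is not part of this excerpt; log-probabilities are used to avoid underflow):

def score_document(words, author):
    # log P(author) plus the sum of log P(word | author), using the smoothed tables above
    score = math.log(prior_prob[author])
    for word in words:
        if STEMMING:
            word = stemmer.stemWord(word)
        score += math.log(word_prob[author].get(word, unknown_prob[author]))
    return score

# the predicted author would be: max(authors, key=lambda a: score_document(test_words, a))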
Example #58
    def init(self, options):
        # type: (Any) -> None
        self.stemmer = snowballstemmer.stemmer('danish')
Example #59
    def init(self, options):
        # type: (Any) -> None
        self.stemmer = snowballstemmer.stemmer('portuguese')
Example #60
# coding=utf-8
import re
import codecs
import cPickle as pickle
import pymorphy2
from snowballstemmer import stemmer

__author__ = 'annie'

__morph = pymorphy2.MorphAnalyzer()
__stemmer = stemmer('russian')
__pattern = re.compile(u'(?u)[A-zА-я]{2,}')

open_read = lambda file: codecs.open(file, encoding='utf-8', mode='r')
open_write = lambda file: codecs.open(file, encoding='utf-8', mode='w')


def str_dict(dict_):
    """
    Right conversion dict to string. Without nesting level of values.
    :param dict_: some dict
    :return: str
    """
    ans = [u'{0}: {1}'.format(k, v) for k, v in sorted(dict_.items())]
    return u'\n'.join(ans)


def print_dict(dict_):
    print(str_dict(dict_), u'\n\n{0} ключей'.format(len(dict_)))