Code Example #1
File: common.py Project: Overhaug/HuJuRecSys
def stemming_and_stopwords(text):
    stemmer = SnowballStemmer("english")
    stop = stopwords.words("english")
    text = text.apply(lambda x: x.split())
    # text = text.apply(lambda word_list: [w for w in word_list if w not in stop])
    return text.apply(lambda word_list: " ".join(
        [stemmer.stem(w) for w in word_list if w not in stop]))
Code Example #2
class SnowballStemmerNormalizer(Normalizer):
    """
    A Normalizer that uses the NLTK SnowballStemmer to normalize tokens.
    """

    def __init__(self, language='english'):
        self.language = language
        self.stemmer = SnowballStemmer(self.language)

    def normalize(self, token):
        """
        Apply normalization techniques over the token to simplify its structure.
        :param token:
        :return: A stem of the token, or the token, if a stem could not be produced.
        """

        return self.stemmer.stem(token)

    def normalize_list(self, tokens):
        """
        Normalize an entire list of tokens.
        :param tokens: A list of tokens to be normalized
        :return: Yields a normalized token
        """
        for token in tokens:
            yield self.stemmer.stem(token)
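
A quick way to exercise a normalizer like this in isolation; a minimal stand-alone sketch (the project's Normalizer base class is omitted, so SimpleStemNormalizer is an illustrative name, not the original API):

from nltk.stem.snowball import SnowballStemmer

class SimpleStemNormalizer:
    # Minimal stand-in for the class above, without the Normalizer base class.
    def __init__(self, language='english'):
        self.stemmer = SnowballStemmer(language)

    def normalize(self, token):
        # Return the stem of a single token.
        return self.stemmer.stem(token)

    def normalize_list(self, tokens):
        # Lazily yield the stem of every token.
        for token in tokens:
            yield self.stemmer.stem(token)

normalizer = SimpleStemNormalizer()
print(list(normalizer.normalize_list(["running", "flies", "easily"])))
# roughly: ['run', 'fli', 'easili']
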
Code Example #3
    def stemming_text(self, words):

        stemmer = SnowballStemmer("spanish")
        final_text = []
        for word in words:
            final_text.append(stemmer.stem(word))
        return final_text
Code Example #4
def tokenize(text, stemming=True, stoplist=[], remove_digits=False, lang='en'):
    translator = str.maketrans(
        string.punctuation,
        ' ' * len(string.punctuation))  # map punctuation to space
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    if stemming:
        if lang == 'en':
            stemmer = Stemmer()
        elif lang == 'it':
            stemmer = SnowballStemmer('italian')
        elif lang == 'de':
            stemmer = SnowballStemmer('german')
        elif lang == 'fa':
            stemmer = paStemmer()
        analyzer = StemmingAnalyzer(stoplist=stoplist,
                                    minsize=1,
                                    stemfn=stemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=1)

    tokens = [token.text for token in analyzer(text)]
    if remove_digits:
        tokens = [
            word for word in tokens
            if not contains_digits(word) and 2 <= len(word)
        ]
    return tokens
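
The language switch above depends on Whoosh analyzers and a Persian stemmer that are not shown here. A self-contained sketch of the same idea using only NLTK (simple_tokenize and its language map are illustrative, not part of the project):

from nltk.stem.snowball import SnowballStemmer

def simple_tokenize(text, lang='en'):
    # Map the short language codes used above to Snowball language names,
    # falling back to English for anything unknown (the Persian branch is omitted).
    lang_map = {'en': 'english', 'it': 'italian', 'de': 'german'}
    stemmer = SnowballStemmer(lang_map.get(lang, 'english'))
    return [stemmer.stem(token) for token in text.lower().split()]

print(simple_tokenize("Running faster than the runners"))
# roughly: ['run', 'faster', 'than', 'the', 'runner']
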
Code Example #5
 def target_stemming_spanish(self, words):
     result = ""
     wordset = words.split(" ")
     stemmer = SnowballStemmer('spanish')
     for word in wordset:
         result += stemmer.stem(word) + "_"
     return result
Code Example #6
def stem_text(text: str, lang_code: str) -> list:
    if lang_code in languages.languages.keys():
        tokens = word_tokenize(text)
        stemmer = SnowballStemmer(languages.languages[lang_code])
        stems = [stemmer.stem(token) for token in tokens]
        return stems
    return []
Code Example #7
    def text_process(self, text):
        # Remove punctuation
        no_punc = [
            char.lower() for char in text if char not in string.punctuation
        ]
        # Join the characters again to form the string.
        no_punc = ''.join(no_punc)
        # Remove any stopwords
        try:
            no_stopwords = [
                word for word in no_punc.split()
                if word.lower() not in stopwords.words(self.language)
            ]
        except LookupError:
            nltk.download('stopwords')
            no_stopwords = [
                word for word in no_punc.split()
                if word.lower() not in stopwords.words(self.language)
            ]
        result = no_stopwords

        if self.tagging:
            # Tag each word
            tagged_words = self._tag_text(result)
            # Remove unwanted tags
            extracted_tags = self._extract_tags(tagged_words)
            result = extracted_tags

        if self.stemming:
            # Stem it
            stemmer = SnowballStemmer(self.language)
            result = [stemmer.stem(word) for word in result]

        return result
Code Example #8
 def sentence_stemming(sentence):
     options = {
         "ar": "arabic",
         "da": "danish",
         "nl": "dutch",
         "en": "english",
         "fi": "finnish",
         "fr": "french",
         "de": "german",
         "hu": "hungarian",
         "it": "italian",
         "no": "norwegian",
         "pt": "portuguese",
         "ro": "romanian",
         "ru": "russian",
         "es": "spanish",
         "sw": "swedish"
     }
     c = detect(sentence)
     try:
         stemmer = SnowballStemmer(options[c])
     except KeyError:
         print("Language not supported")
         sys.exit()
     s = "".join(stemmer.stem(i) + " " for i in sentence.split())
     return "".join(s + " " for s in word_tokenize(s)
                    if s not in set(stopwords.words(options[c])))
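
When the detected language code is missing from the table, the function above terminates the whole process. A minimal sketch of a softer fallback, assuming detect comes from the langdetect package (the original imports are not shown) and NLTK is installed:

from langdetect import detect
from nltk.stem.snowball import SnowballStemmer

def safe_stem_sentence(sentence):
    # Map langdetect's ISO 639-1 codes to Snowball language names; anything
    # unknown falls back to English instead of calling sys.exit().
    code_to_name = {"da": "danish", "de": "german", "en": "english", "es": "spanish",
                    "fr": "french", "it": "italian", "pt": "portuguese",
                    "ru": "russian", "sv": "swedish"}
    stemmer = SnowballStemmer(code_to_name.get(detect(sentence), "english"))
    return " ".join(stemmer.stem(token) for token in sentence.split())

print(SnowballStemmer.languages)  # the language names SnowballStemmer accepts
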
Code Example #9
def bag_of_words_spacy2(dataset):
    import spacy
    nlp = spacy.load('es_core_news_md')
    spanishstemmer = SnowballStemmer("spanish")
    all_stopwords = stopwords.words('spanish')
    all_stopwords.extend(
        ("saludo", "dia", "noche", "noches", "tardes", "buenos", "buenas",
         "atentamente", "dias", "estimado", "estimados", "estimada", "atte",
         "hola", "gracia", "caja", "respuesta", "adjunto", "mucha", "me",
         "cordoba", "buen", "ud"))
    removeList = ["no", "nunca"]
    all_stopwords = [e for e in all_stopwords if e not in removeList]
    corpus = []
    for i, value in dataset.items():
        review = str(html.unescape(dataset[i]))
        review = cleanhtml(review)
        review = re.sub(
            r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
            r"\1", normalize("NFD", review), 0, re.I)
        review = normalize('NFC', review)
        review = re.sub('[^a-zA-Zá-ú0-9]', ' ', review)
        review = review.lower()
        doc = nlp(review)
        stems = [
            spanishstemmer.stem(token.text) for token in doc
            if token.text not in all_stopwords  # compare the token text, not the spaCy Token object
        ]
        review = ' '.join(stems)
        corpus.append(review)
    return corpus
Code Example #10
 def __init__(self, lang):
     lang_ipa = {'es': 'spa-Latn', 'en': 'eng-Latn'}
     lang_stemm = {'es': 'spanish', 'en': 'english'}
     self.lang = lang
     self.stemmer = SnowballStemmer(language=lang_stemm[lang])
     self.epi = epitran.Epitran(lang_ipa[lang])
     self.nlp = self.load_sapcy(lang)
Code Example #11
    def quadratic(cls, language: 'model.Language'):
        """
                This estimator computes the ratio of new words for a given user and language
                :param language: language of the text that needs to be estimated
                :param user: the user for which the difficulty estimation needs to be done
                :rtype: WordHistoryDifficultyEstimator
                :return: WordHistoryDifficultyEstimator with initialized user, language and word => score map
                        which can be used for determining scores for multiple articles for the same user and language
        """

        estimator = cls(language)

        freq_list = load_language_from_hermit(language.code)

        word_dict = dict()
        for k, v in freq_list.word_info_dict.items():
            word_dict[k] = v.frequency

        stemmer = SnowballStemmer(language.name.lower())

        score_map = defaultdict(int)

        for k, v in word_dict.items():
            score_map[stemmer.stem(k.lower())] += v

        max_freq = max(score_map.values())

        for k in score_map.keys():
            score_map[k] = (1 - score_map[k] / max_freq)**0.5

        estimator.score_map = score_map

        return estimator
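
A toy illustration of the scoring step above, with made-up stem frequencies: the most frequent stem scores 0 and rare stems approach 1.

freqs = {"hous": 100, "cat": 50, "catastroph": 5}  # made-up frequencies
max_freq = max(freqs.values())
scores = {stem: (1 - freq / max_freq) ** 0.5 for stem, freq in freqs.items()}
print(scores)  # roughly {'hous': 0.0, 'cat': 0.71, 'catastroph': 0.97}
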
Code Example #12
File: nlp_methods.py Project: kewilliams86/CSC-450
def stemLine (text):

    snow = SnowballStemmer('english')
    
    text = [snow.stem(t) for t in text.split()]

    return (' ').join(text)
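
A hypothetical call, assuming the SnowballStemmer import that the file relies on is present:

print(stemLine("The running dogs were barking loudly"))
# roughly: 'the run dog were bark loud'
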
Code Example #13
def bag_of_words_spacy(dataset):
    import spacy
    spanishstemmer = SnowballStemmer("spanish")
    nlp = spacy.load('es_core_news_md')
    nlp.Defaults.stop_words |= {
        "saludo", "dia", "noche", "noches", "tardes", "buenos", "buenas",
        "atentamente", "dias", "hola", "estimado", "estimados", "estimada",
        "atte"
    }
    nlp.Defaults.stop_words -= {"no", "nunca"}
    corpus = []
    for i, value in dataset.items():
        review = str(html.unescape(dataset[i]))
        review = cleanhtml(review)
        doc = nlp(review)
        words = [t.orth_.lower() for t in doc if not t.is_punct | t.is_stop
                 ]  # drop punctuation marks and stopwords
        #lexical_tokens = [t.lower() for t in words if len(t) > 2 and t.isalpha()] # lowercase, drop 2-letter words and numbers
        review = ' '.join(words)
        doc = nlp(review)
        lemmas = [tok.lemma_.lower() for tok in doc]
        stems = [spanishstemmer.stem(token) for token in lemmas]
        review = ' '.join(stems)
        corpus.append(review)
    return corpus
Code Example #14
    def __init__(self, min_occurrence=10, window=15, from_corpus=False):
        self.min_occurrence = min_occurrence
        self.window = window

        # map words to integers (more memory efficient and faster)
        self.word2int_count = count()
        self.word2int = defaultdict(self.word2int_count.__next__)

        # map city names also to ints
        self.city2int_count = count()
        self.city2int = defaultdict(self.city2int_count.__next__)

        self.stemmer = SnowballStemmer('german')
        self.stopwords = set(stopwords.words('german')).union(STOP_CITIES)
        self.stems = defaultdict(lambda: defaultdict(int))

        self.cores = multiprocessing.cpu_count()

        if from_corpus:
            print("loading spacy", file=sys.stderr, flush=True)
            self.nlp = spacy.load('de',
                                  parser=False,
                                  tagger=True,
                                  entity=False)
            print("done...", file=sys.stderr, flush=True)
Code Example #15
def normalizeWords(text):
    ''' Text preprocessing '''
    stemmer = SnowballStemmer(language='english')
    test = re.compile(r'\W+', re.UNICODE).split(text[0].lower())
    stop_words = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
        'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
        'with', 'about', 'against', 'between', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
        'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
        't', 'can', 'will', 'just', 'don', 'should', 'now', 'html'
    ]
    test = [
        stemmer.stem(word) for word in test
        if not word in stop_words and word.isalpha() and len(word) > 2
    ]
    return (test, text[1], len(test))
Code Example #16
    def embers_stem(x):
        """
        DESCRIPTION
        It will do stemming for words in x considering english, spanish and portuguese

        INPUT
        x: a tweet text, or other sentense or paragraph

        OUTPUT
        the tweet text after stemming.

        """
        x = x.lower()
        if isinstance(x, unicode) == False:
            x = x.decode('utf-8', 'ignore')
        try:
            stemmer = SnowballStemmer('spanish')
            x1 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
            if (x1 == ''):
                x1 = x
            #            print x1
            stemmer = SnowballStemmer('english')
            x2 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
            if (x2 == ''):
                x2 = x
            #            print x2
            stemmer = SnowballStemmer('portuguese')
            x3 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
            if (x3 == ''):
                x3 = x
            #            print x3
            #        print 'success'
            return min(x1, x2, x3, key=lambda x: len(x))
        except:
            return x
Code Example #17
def delete_mark(request):
    args = {'status': 0}
    if request.POST:
        from_entity = request.POST.get('current_entity_id', '')
        to_entity = request.POST.get('to_entity', '')
        project_id = request.POST.get('project_id', '')

        stemmer = SnowballStemmer('russian')
        stem_entity = stemmer.stem(to_entity)

        entities = Entity.objects.filter(project_id=project_id)
        ent = LinkingEntities(entities, None)
        stem_entities = ent.get_stemmed_names_of_entity()
        to_id = 0
        for ent in stem_entities:
            if stem_entity == ent.get("stemmed_name"):
                to_id = ent.get("id")

        unmark_entity = LinksBetweenEntities.objects.filter(from_entity_id=from_entity,
                                                            to_entity_id=to_id,
                                                            is_unmarked=False)
        if unmark_entity:
            data = {'from_entity_id': from_entity,
                    'to_entity_id': to_id,
                    'is_unmarked': True}
            unmark_entity.update(**data)
            args['status'] = 1
            messages.add_message(request, messages.SUCCESS, "Связь успешно удалена")

    return JsonResponse(args)
Code Example #18
 def __init__(self,
              max_edit_distance_dictionary: int = 5,
              prefix_length: int = 10,
              count_threshold: int = 1,
              compact_level: int = 5):
     super().__init__(max_edit_distance_dictionary, prefix_length,
                      count_threshold, compact_level)
     self.stemmer = SnowballStemmer('german')
Code Example #19
File: taskA.py Project: eliabisconti/haspeede
def get_stem(lang, sentence):
    stemmer = SnowballStemmer(lang)
    stemmed = ''
    for word in casual_tokenize(sentence):
        word = stemmer.stem(word)
        stemmed = stemmed + word + ' '

    return stemmed
Code Example #20
File: emotions.py Project: ebegoli/AffectiveNLP
def main():
    #get_synsets( parrot_primary)
    stemmer = SnowballStemmer('english')
    print stemmer.stem( "affectionate" )

    print is_emotion( "affection" )
    print is_emotion( "Affection" ) 
    print is_emotion( "haha" )
Code Example #21
 def __init__(self):
     # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
     self._tok = MosesTokenizer(lang='en')
     self._stemmer = SnowballStemmer('english')
     self._lemmatizer = TreeTagger(language='english')
     self._stopwords = set(open(STOPWORDS).read().splitlines())
     # istopwords.words('french') #
     self._porter_stemmer = nltk.stem.porter.PorterStemmer()
Code Example #22
def stemLine(title, abstract):

    snow = SnowballStemmer('english')

    title = [snow.stem(t) for t in title.split()]
    abstract = [snow.stem(a) for a in abstract.split()]

    return (' ').join(title) + '\t' + (' ').join(abstract)
Code Example #23
def stem(word):
    '''Supported languages: danish, dutch, english, finnish, french, german,
    hungarian, italian, norwegian, porter, portuguese, romanian, russian,
    spanish, swedish.'''
    stemmer = SnowballStemmer("english")
    try:
        word = stemmer.stem(word).encode('utf-8')
    except Exception, e:
        word = word
    return word  # return the (possibly stemmed) word
Code Example #24
def preprocess(question, spelling_correction, stemming, featurenames, numbers_feature):
	"""
	preprocess the questions
		question - string containing the text of the question
		spelling_correction - Boolean parameter which decides whether to use spelling correction or not, by default False
		stemming - Boolean parameter whether to do the stemming or not
		featurenames - set of feature names
		numbers_feature - Boolean parameter whether to include an indication about a number in the datapoint

		return:
			valid_questions - list of non-empty questions
			featurenames - list of feature names
	"""
	# make a list with german stop-words
	stop_words = stopwords.words('german')
	stop_words = [i.decode('utf-8') for i in stop_words]
	# create the stemmer
	stemmer = SnowballStemmer("german")
	# create a dictionary of german words for spelling correction
	if spelling_correction:
		german_dict = enchant.Dict("de_DE")
	valid_questions = []
	for i in question:
		# check if the question has a category
		if i[3] == 'N':
			continue
		contains_num = 0
		category_text = i[4]
		if re.search('\d+', category_text):
			contains_num = 1
		# remove the punctuation
		category_text = re.sub(r'[^a-zA-Z ]',' ', category_text)
		# remove the stop words and split questions into words
		category_text = category_text.split()
		category_text = [w for w in category_text if w not in stop_words]
		for k in range(len(category_text)):
			# do the spelling correction, if specified
			if spelling_correction:
				if not german_dict.check(category_text[k]):
					try:
						category_text[k] = german_dict.suggest(category_text[k])[0]
					except:
						pass
			# convert words to lowercase
			category_text[k] = category_text[k].lower()
			# stem the words
			if stemming:
				category_text[k] = stemmer.stem(category_text[k])
			#save the words as features
			if category_text[k]:
				featurenames.add(category_text[k])
		# if the text of the question is not empty, append the question to the list of valid questions
		if category_text:
			i[4] = category_text 
			if numbers_feature:
				i[4] += [contains_num]
			valid_questions.append(i)
	return valid_questions, list(featurenames)
Code Example #25
def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()

    cleaned_text_list = []
    for text in input_text_list:
        text = text.translate(string.punctuation)  # Remove punctuation
        text = text.lower()  # Convert words to lower case and split them

        # text = " ".join(text)

        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ",
                      text)  # 除A-Za-z0-9(),!?'`外的字符,去除
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t.co/[A-Za-z]{10}", " ", text)

        text = text.split()

        text = [word for word in text
                if word not in stoplist]  ## remove stopwords once before stemming

        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]

        cleanwordlist = [
            word for word in stemmed_words if word not in stoplist
        ]  ## remove stopwords again after stemming

        text = " ".join(cleanwordlist)

        cleaned_text_list.append(text)
    return cleaned_text_list
Code Example #26
def clean_text(text):
    import nltk
    nltk.download('stopwords')
    translate_table = dict((ord(char), None) for char in string.punctuation)
    text = text.translate(translate_table)

    re_url = re.compile(
        r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\
                        .([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
        re.MULTILINE | re.UNICODE)
    re_ip = re.compile("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

    text = re_url.sub("URL", text)

    text = re_ip.sub("IPADDRESS", text)

    text = text.lower().split()

    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]

    text = " ".join(text)

    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text
Code Example #27
File: clean.py Project: ZhangYW18/AppliedTextMining
def preprocess(tweet):
    stop_words = stopwords.words("english")
    stemmer = SnowballStemmer("english")
    tweet = re.sub(TEXT_CLEANING_RE, ' ', str(tweet).lower()).strip()
    tokens = []
    for token in tweet.split():
        if token not in stop_words:
            tokens.append(stemmer.stem(token))
    return " ".join(tokens)
Code Example #28
def stem_tweets(sentence):
    stemmer = SnowballStemmer("english")
    stem_sentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stem_sentence += stem
        stem_sentence += " "
    stem_sentence = stem_sentence.strip()
    return stem_sentence
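
A hypothetical call, again assuming the usual SnowballStemmer import:

print(stem_tweets("cats running quickly"))
# roughly: 'cat run quick'
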
Code Example #29
 def stemm(self, document):
     '''
     document stemming
     '''
     if len(self.tokens) == 0:  # empty list
         self.tokenize(document)
     spanishstemmer = SnowballStemmer('spanish')
     self.stems = [spanishstemmer.stem(token) for token in self.tokens]
Code Example #30
 def findOrigin(self):
     stemmer = SnowballStemmer("english")
     tokenList = word_tokenize(self.text)
     for token in tokenList:
         # Get the current token and stem it
         curStem = stemmer.stem(token)
         # if current token's stemmed version can be found in stemlist
         if curStem in self.stemList and token not in self.stemDict[curStem]:
             self.stemDict[curStem].append(token)
Code Example #31
File: SenseModelLearner.py Project: qiuwei/snlp
 def learn(corpus, epsilon=None):
     # set default smooth parameter
     if(epsilon == None):
         epsilon = 0.1
     prior_sense_dist = {}
     total_num_sense = 0
     word_sense_dist = {}
     sense_sum = {}
     snstmr = SnowballStemmer("english")
     #vocabulary = []
     senses = []
     # count all of the senses
     for instance in corpus:
         sense = instance.get_sense()
         if sense not in senses:
             senses.append(sense)
     for instance in corpus:
         context = map(lambda x: snstmr.stem(x), instance.get_context().lower().strip().split())
         # count all senses to estimate the distribution of senses
         sense = instance.get_sense()
         total_num_sense += 1
         try:
             prior_sense_dist[sense] += 1
         except KeyError:
             prior_sense_dist[sense] = 1
                 
         #head = instance.get_head_word()
         # count all (word, sense) co-occurrences to estimate the conditional distribution, namely P(word|sense)
         for word in context:
             try:
                 word_sense_dist[(word, sense)] += 1
             #find a new word
             except KeyError:
                 # put it in the matrix
                 word_sense_dist[(word,sense)] = 1 + epsilon
                 # also put it for all other possible senses
                 for s in senses:
                     if s != sense:
                         word_sense_dist[(word, s)] = epsilon
     
     # sum over senses, will be used to calculate the probability
     for k in word_sense_dist.keys():
         try:
             sense_sum[k[1]] += word_sense_dist[k]
         except KeyError:
             sense_sum[k[1]] = 0
             
     for k in word_sense_dist.keys():
         word_sense_dist[k] =  math.log(float(word_sense_dist[k]) / float(sense_sum[k[1]]))
         #print k, word_sense_dist[k]
     #print "========="
     for k in prior_sense_dist.keys():
         prior_sense_dist[k] = math.log(float(prior_sense_dist[k]) / total_num_sense)
         #print prior_sense_dist[k]
         
             
     return (prior_sense_dist,word_sense_dist)
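
The learner above is essentially Naive Bayes with add-epsilon smoothing over stemmed context words. A toy, self-contained sketch of the counting and log-probability steps on made-up data (the sense labels and contexts are illustrative, not from the project):

import math
from collections import defaultdict

epsilon = 0.1
data = [("financial", "money loan account"),  # (sense, context) pairs, made up
        ("financial", "money deposit"),
        ("river", "water shore fishing")]

prior = defaultdict(int)
counts = defaultdict(lambda: epsilon)  # unseen (word, sense) pairs default to epsilon
for sense, context in data:
    prior[sense] += 1
    for word in context.split():
        counts[(word, sense)] += 1  # first observation yields 1 + epsilon

total = sum(prior.values())
log_prior = {s: math.log(c / total) for s, c in prior.items()}

sense_totals = defaultdict(float)
for (word, sense), c in counts.items():
    sense_totals[sense] += c
log_cond = {(w, s): math.log(c / sense_totals[s]) for (w, s), c in counts.items()}

print(log_prior)
print(log_cond[("money", "financial")])
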
Code Example #32
def softClean(text,
              rmPunc=False,
              sentTok=False,
              rmNumber=False,
              stop_words=False,
              stem=False,
              lower_case=False,
              rm_char=False):
    if lower_case:
        text = text.lower()

    cleanText = re.sub(RVM_REPEATED_PUNC, '\\1', text)

    # Remove time
    cleanText = re.sub(r'[0-9]{1,2}:[0-9]{2}:[0-9]{2}', '', cleanText)
    # Remove date
    cleanText = re.sub(r'[0-9]{1,4}([/-])[0-9]{1,2}\1[0-9]{2,4}', '',
                       cleanText)

    if sentTok:
        cleanText = '\n'.join(sent_tokenize(text))

    tokens = word_tokenize(cleanText) if stop_words or stem else None

    if stem:
        stop_word_set = set(stopwords.words('english') +
                            ["n't"]) if stop_words else set()
        stemmer = SnowballStemmer('english', ignore_stopwords=True)
        old_tokens = tokens
        tokens = []

        for token in old_tokens:
            nt = stemmer.stem(token)

            if len(nt) > 0 and nt not in stop_word_set:
                tokens.append(nt)

    elif stop_words:
        stop_word_set = set(stopwords.words('english') + ["n't"])
        tokens = list(filter(lambda token: token not in stop_word_set, tokens))

    if tokens is not None:
        cleanText = ' '.join(tokens)

    if rmPunc:
        cleanText = re.sub(SPLIT_PUNCT_REGEX, ' ', cleanText)
    else:
        cleanText = re.sub(SPLIT_PUNCT_REGEX, ' \\1 ', cleanText)

    if rmNumber:
        cleanText = re.sub(r'\b[0-9]+\b', '', cleanText)

    if rm_char:
        cleanText = re.sub(r'\b\S\b', '', cleanText)

    return cleanText
Code Example #33
    def __init__(self, dictionary_loader: DictionaryLoader, stop_words_file: str, termination_terms_file: str,
                 stemmer: StemmerI = None):
        super().__init__(stop_words_file, termination_terms_file, dictionary_loader)
        self.stemmer = stemmer
        self.unigram_stem_index = dict()  # record the stem and give an Id
        self.concept_length_index = dict()  # record the stem and give the length of the expression
        if not stemmer:
            self.stemmer = SnowballStemmer("english")

        self.punctuation_remove = regex.compile('\p{C}', regex.UNICODE)
Code Example #34
 def __init__(self):
     self.stemmer = SnowballStemmer("english")
     self.stop_list = set(stopwords.words("english"))
     self.dv = DictVectorizer()
     self.vocab_index = {}
     self.index_counter = 0
     self.doc_index = {}
     self.doc_vectors = {}
     self.vocab_counter = Counter()
     self.loaded_vocab = False
Code Example #35
File: parser.py Project: anafisa/News-Bot
def rss_news(rss, word):
    stemmer = SnowballStemmer("russian")
    w = stemmer.stem(word)
    news = []
    feed = feedparser.parse(rss)
    for post in feed.entries:
        lst = list(map(stemmer.stem, (post.title + post.description).split()))
        if w in lst:
            news.append(f"{post.title} \n{post.description}")
    return news
Code Example #36
File: features_function.py Project: eme-ele/IA-usb
def numero_palabras_positivas(tweet):
	res = 0
	twt = tweet.split()
	try:
		stemmer = SnowballStemmer("spanish")
		for w in twt:
			if stemmer.stem(unicode(w.strip(),'UTF-8')).encode("UTF-8") in positivas_list:
				res += 1
	except:
		pass
	return res
Code Example #37
File: server.py Project: cjauvin/toi
    def query():

        stemmer = SnowballStemmer('english')
        raw_tokens = [w for w in re.split('\W+', request.args['q']) if w.lower() not in set(stopwords.words('english'))]
        tokens = [stemmer.stem(w) for w in raw_tokens]
        token_s2r = dict(zip(tokens, raw_tokens))

        g = Graph()
        g.parse('../PopHR-ToyOnto2.rdf')

        results = []
        matched_tokens = []
        query_type = None

        for n in [3, 2]: # important: begin with the most specific query
            if n > len(tokens): continue
            for p in product(['lower', 'higher'], permutations(tokens, n)):
                p = flatten(p)
                p3 = p[3] if n == 3 else None # relation
                results = findLowHighDiseaseRelationships(g, p[0], p[1], p[2], p3)
                if results:
                    query_type = 'disease_relationship'
                    matched_tokens = [token_s2r[w] for w in p[1:]] # first is lower/higher
                    break
            if results: break

        if not results:
            for w in tokens:
                result = findTopAncestorConcept(g, w)
                if result:
                    query_type = 'single_concept'
                    results.append(result)
                    matched_tokens.append(token_s2r[w]) # don't break, there might be other single concepts!

        html_results = []

        if query_type == 'disease_relationship':
            for r in results:
                s = '<p>&gt; %s is a %s in relation %s to %s</p>' % tuple([concept(r[k]) for k in ['lower', 'higher', 'relation', 'disease']])
                html_results.append(s)
        elif query_type == 'single_concept':
             #s = '<p>&gt; %s</p>' % ' and '.join(['%s is a %s' % (concept(r['lower']), concept(r['highest'])) for r in results])
            for r in results:
                s = '<p>&gt; %s is a %s</p>' % tuple([concept(r[k]) for k in ['lower', 'highest']])
                html_results.append(s)
        else:
            html_results.append('<p>&gt; unable to process query</p>')

        html_query = request.args['q']
        for w in matched_tokens:
            html_query = re.sub(w, '<span class="highlighted">%s</span>' % w, html_query)

        return json.dumps({'html_results': html_results, 'html_query': html_query})
Code Example #38
 def get_stemmed_names_of_entity(self):  # get stemmed names of entities
     stemmer = SnowballStemmer('russian')
     stemmed_names_in = self.get_names_of_entity()
     stemmed_names_out = []
     for item in stemmed_names_in:
         stemmed = " ".join([stemmer.stem(word.lower()) for word in item.get('name').split(" ")])  # for чтобы
         # отстеммить name_of_entity, состоящий из нескольких слов
         if stemmed.endswith('ок'):
             stemmed = stemmed[:-2] + 'к'
         stemmed_names_out.append({'stemmed_name': str(stemmed).lower(),
                                   'id': item.get('id')})
     return stemmed_names_out
Code Example #39
File: MFSTagger.py Project: qiuwei/snlp
class MFSTagger(object):
    '''
    Most-frequent-sense tagger, used as the baseline system.
    '''


    def __init__(self, model):
        '''
        Constructor
        '''
        self.prior_sense_dist = model
        self.snstmr = SnowballStemmer("english")
    
    def tag(self, sentence):
        context = map(lambda x: self.snstmr.stem(x), sentence.lower().strip().split())
        predict_sense = "init"
        prob = float('-inf')
        for candi_sense in self.prior_sense_dist.keys():
            #print candi_sense
            current_prob = self.prior_sense_dist[candi_sense]
            if current_prob > prob:
                predict_sense = candi_sense
                #update the probability
                prob = current_prob
        return predict_sense
Code Example #40
File: SenseTagger.py Project: qiuwei/snlp
class SenseTagger(object):
    
    prior_sense_dist = {}
    word_sense_dist = {}
    
    def __init__(self, model):
        self.prior_sense_dist = model[0]
        self.word_sense_dist = model[1]
        self.snstmr = SnowballStemmer("english")
    
    def tag(self, sentence):
        context = map(lambda x: self.snstmr.stem(x), sentence.lower().strip().split())
        predict_sense = "init"
        prob = float('-inf')
        for candi_sense in self.prior_sense_dist.keys():
            #print candi_sense
            current_prob = self.prior_sense_dist[candi_sense]
            #print current_prob
            #print '--------'
            for word in context:
                try:
                    current_prob += self.word_sense_dist[(word, candi_sense)]  
                    #print current_prob
                except KeyError:
                    pass
            #print current_prob, prob
            if current_prob > prob:
                predict_sense = candi_sense
                #update the probability
                prob = current_prob
        
        return predict_sense
Code Example #41
class Tokenizer(object):
    """
        For a given language it
         - splits the text into tokens
         - applies snowball stemmer

        Main method = @get_tokens
    """
    def __init__(self, language='russian'):
        self.stopwords = set(stopwords.words(language)).union('. , ? ! ( )'.split())
        self.stemmer = SnowballStemmer('russian')

    def get_tokens(self, s):
        """
            :param s
            :type str

            :return list of str
            list of stemmed tokens
        """
        return map(self._process_token, self._str2tokens(s))

    def _str2tokens(self, s):
        return list(set(word_tokenize(s.lower())).difference(self.stopwords))

    def _process_token(self, token):
        return self.stemmer.stem(token).encode('utf-8')
Code Example #42
File: Utils.py Project: lwheng/fyp
class nltk_tools:
  def __init__(self):
    self.stemmer = SnowballStemmer("english")

  def nltk_word_tokenize(self, text):
    return nltk.word_tokenize(text)

  def nltk_text(self, tokens):
    return nltk.Text(tokens)

  def nltk_text_collection(self, documents):
    return nltk.TextCollection(documents)

  def nltk_stopwords(self):
    return stopwords.words('english')

  def nltk_cosine_distance(self, u, v):
    # Closer to 1 the better
    return nltk.cluster.util.cosine_distance(u,v)

  def nltk_stemmer(self, input_string):
    return self.stemmer.stem(input_string)

  def nltk_pos(self, text):
    return nltk.pos_tag(text)

  def nltk_bigrams(self, text):
    return nltk.bigrams(text)
Code Example #43
File: tfidf.py Project: AmitPoonia/question2macro
def wordlist(body, remove_stopwords=False, stem=False):
    """ convert a document to a sequence of words, optionally removing stop words.  Returns a list of words."""

    # Remove non-letters
    text = re.sub("[^a-zA-Z]", " ", body)

    # convert words to lower case and split them
    words = text.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    if stem:
        stemmer = SnowballStemmer("english")
        words = [stemmer.stem(w) for w in words]

    return words
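
A hypothetical call, assuming the module-level imports the function relies on (re, NLTK's stopwords and SnowballStemmer) are in place:

print(wordlist("The cats were running quickly", remove_stopwords=True, stem=True))
# roughly: ['cat', 'run', 'quick']
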
Code Example #44
def mark_as_entity(request):
    args = {'status': 0}
    if request.POST:
        post_project_id = request.POST.get('project_id', '')
        post_entity_name = request.POST.get('entity_name', '')  # to_hovered_word

        # unmarked_entities
        from_entity = request.POST.get('current_entity_id', '')

        stemmer = SnowballStemmer('russian')
        stem_entity = stemmer.stem(post_entity_name)

        entities = Entity.objects.filter(project_id=post_project_id)
        ent = LinkingEntities(entities, None)
        stem_entities = ent.get_stemmed_names_of_entity()
        to_entity = 0
        for ent in stem_entities:
            if stem_entity == ent.get("stemmed_name"):
                to_entity = ent.get("id")

        if from_entity == to_entity:
            pass
        else:
            unmark_entity = LinksBetweenEntities.objects.filter(from_entity_id=from_entity,
                                                                to_entity_id=to_entity)
            if unmark_entity:
                data = {'from_entity_id': from_entity,
                        'to_entity_id': to_entity,
                        'is_unmarked': False}
                unmark_entity.update(**data)
                args['status'] = 1
                messages.add_message(request, messages.SUCCESS, "Слово отмечено как сущность")
            #
            else:  # add new entity
                morph = pymorphy2.MorphAnalyzer()
                new_entity = morph.parse(post_entity_name)[0].normal_form.title()
                args['new_entity'] = new_entity
                args['status'] = 2
                #
                # new_entity = Entity(item_name=post_entity_name, description='', project_id=post_project_id)
                # new_entity.save()
                # args['status'] = 1
    return JsonResponse(args)
Code Example #45
def preprocess_data(data):
    """
    :param data: data to be pre-processed
    :return: pre-processed data
    """

    stemmer2 = SnowballStemmer("english") # for removing stem words
    stop_words = text.ENGLISH_STOP_WORDS  # omit stop words

    temp = data
    temp = re.sub("[,.-:/()?{}*$#&]"," ",temp)  # remove all symbols
    temp = "".join([ch for ch in temp if ch not in string.punctuation])  # remove all punctuation
    temp = "".join(ch for ch in temp if ord(ch) < 128)  # remove all non-ascii characters
    temp = temp.lower() # convert to lowercase
    words = temp.split()
    no_stop_words = [w for w in words if not w in stop_words]  # remove stop words
    stemmed_data = [stemmer2.stem(plural) for plural in no_stop_words]

    return stemmed_data
Code Example #46
File: text_vector.py Project: daizhen/ImagesCategory
def CreateTextVector():
    fileName='ocr_result _1.csv'
    wordDict = {};
    csvfile = file(fileName, 'rb')
    reader = csv.reader(csvfile)
    regEx = re.compile('\\W*')
    regNumber = re.compile('^\\d+$')
    regStartNumber = re.compile('^\\d+')
    regEndNumber = re.compile('\\d+$')
    stemmer = SnowballStemmer("english")
    for line in reader:
        if len(line) > 0:
            currentFileName = line[0]
            content = line[1]
            rawWordList = regEx.split(content)
            wordList =[tok.lower() for tok in rawWordList if len(tok) > 2 and len(tok) < 20 and regNumber.match(tok)==None]
            #print wordList
            #print len(wordList)
            for currentWord in wordList:
                wordStem = currentWord
                # Remove numbers at the beginning and end of the word
                startNumberMatch =  regStartNumber.match(wordStem)
                if startNumberMatch != None:
                    wordStem = wordStem[startNumberMatch.span()[1]:]
                endNumberMatch = re.search('\\d+$',wordStem)
                if endNumberMatch != None:
                    wordStem = wordStem[:endNumberMatch.span()[0]]
                wordStem = stemmer.stem(wordStem)
                if len(wordStem) >2 :
                    if wordStem in wordDict:
                        wordDict[wordStem] = wordDict[wordStem] + 1
                    else:
                        wordDict[wordStem] = 1
            #break;
    csvfile.close()
    sortedList = sorted(wordDict.iteritems(), key=lambda d:d[1], reverse = False)
    keys = [[item[0]] for item in sortedList if item[1]>2]
    print len(keys)
    #print keys
    
    # Write the keys to csv file
    CSVUtil.WriteCSV('../../data/all_tokens.csv',keys)
Code Example #47
File: tokenizer.py Project: amitsingh2783/kaggle
def tokenize_and_stem(txt, stem=True, remove_html=True, join=False, remove_stopwords=True):
    ''' Remove html and stopwords, tokenize and stem. '''

    lang = 'english'
    if remove_html:
        txt = clean_html(txt)

    words = tokenize(txt)
    if remove_stopwords:
        stop_words = stopwords.words(lang)
        words = [w for w in words if w.lower() not in stop_words]

    if stem:
        stemmer = SnowballStemmer(lang)
        words = [stemmer.stem(word).encode(encoding="utf8") for word in words]

    if join:
        words = " ".join(words)

    return words
Code Example #48
File: stemmize_lists.py Project: eme-ele/IA-usb
def main():
	parser = optparse.OptionParser()
	parser.add_option('-f', help='archivo de la lista', type='string', dest='in_file')
	parser.add_option('-o', help='archivo de salida', type='string', dest='out_file')
	(opts, args) = parser.parse_args()
	mandatories = ['in_file', 'out_file']
	for m in mandatories:
		if not opts.__dict__[m]:
			print "Falta argumento obligatorio"
			parser.print_help()
			exit(-1)

	lista = open(opts.in_file, 'r')
	output = open(opts.out_file, 'w')

	stemmer = SnowballStemmer("spanish")

	for word in lista:
		output.write(stemmer.stem(unicode(word.strip(),'UTF-8')).encode("UTF-8")+"\n")

	lista.close()
	output.close()
Code Example #49
File: stemtransform.py Project: PiTiLeZarD/mETL
class StemTransform( metl.transform.base.Transform ):

    init = ['language']

    # void
    def __init__( self, language, *args, **kwargs ):

        self.language  = language.lower()
        self.stemmer   = SnowballStemmer( self.language )

        super( StemTransform, self ).__init__( *args, **kwargs )

    # Field
    def transform( self, field ):

        if field.getValue() is None:
            return field

        field.setValue( u' '.join( [ self.stemmer.stem( word ) for word in field.getValue().split() if word != u'' ] ) )
        return field
Code Example #50
File: normalizer.py Project: nicolasteodosio/PySent
class TweetNormalizer(object):
    def __init__(self, language):
        self.language = language
        self.stemmer = SnowballStemmer(language, ignore_stopwords=True)

    def clean_stopwords(self, text):
        # Remove stopwords for the configured language
        splitted = [i for i in text.split() if i not in stopwords.words(self.language)]
        cleaned_splitted = []

        # Cleaning twitter stopwords
        for word in splitted:
            cleaned_splitted.append(word)

            for twitter_stopword in TWITTER_STOPWORDS:
                if word.startswith(twitter_stopword):
                    cleaned_splitted.remove(word)

        return ' '.join(cleaned_splitted)

    def stem(self, text):
        splitted = text.split()

        for i, word in enumerate(splitted):
            # import ipdb; ipdb.set_trace()
            # unicode_word = word.encode('utf-8')
            stem_word = self.stemmer.stem(unidecode(word))
            splitted[i] = stem_word

        return ' '.join(splitted)

    def normalize(self, text):
        text = self.clean_stopwords(text)
        text = self.stem(text)

        return text
Code Example #51
File: SenseTagger.py Project: qiuwei/snlp
 def __init__(self, model):
     self.prior_sense_dist = model[0]
     self.word_sense_dist = model[1]
     self.snstmr = SnowballStemmer("english")
Code Example #52
File: SenseModelLearner.py Project: qiuwei/snlp
                except KeyError:
                    # put it in the matrix
                    word_sense_dist[(word,sense)] = 1 + epsilon
                    # also put it for all other possible senses
                    for s in senses:
                        if s != sense:
                            word_sense_dist[(word, s)] = epsilon
        
        # sum over senses, will be used to calculate the probability
        for k in word_sense_dist.keys():
            try:
                sense_sum[k[1]] += word_sense_dist[k]
            except KeyError:
                sense_sum[k[1]] = 0
                
        for k in word_sense_dist.keys():
            word_sense_dist[k] =  math.log(float(word_sense_dist[k]) / float(sense_sum[k[1]]))
            #print k, word_sense_dist[k]
        #print "========="
        for k in prior_sense_dist.keys():
            prior_sense_dist[k] = math.log(float(prior_sense_dist[k]) / total_num_sense)
            #print prior_sense_dist[k]
            
                
        return (prior_sense_dist,word_sense_dist)


if __name__ == '__main__':
    snstmr = SnowballStemmer("english")
    print snstmr.stem("stemming")
    
Code Example #53
File: stemming.py Project: sinjax/trendminer-python
import sys
import json
from nltk import SnowballStemmer

if __name__=='__main__':
  for line in sys.stdin:
    try:
      tweet=json.loads(line,strict=False)
    except:
      continue  
    lang=tweet['lang_det']
    tweet['stemmed']=[]
    options={'de': "german", 'en': "english", 'ro': "romanian", "da": "danish", "nl": "dutch", "fi": "finnish", "fr": "french", "hu": "hungarian", "it": "italian", "no": "norwegian", "pt": "portuguese", "ru": "russian", "es": "spanish", "sv": "swedish"}
    try:
      stemmer=SnowballStemmer(options[lang])
    except: 
      print json.dumps(tweet)
      continue
    tokens=tweet['tokens']
    for token in tokens:
      tweet['stemmed'].append(stemmer.stem(token))
    print json.dumps(tweet)