Code Example #1
    def embers_stem(x):
        """
        DESCRIPTION
        Stems the words in x, considering English, Spanish and Portuguese.

        INPUT
        x: a tweet text, or another sentence or paragraph

        OUTPUT
        The tweet text after stemming.

        """
        x = x.lower()
        if not isinstance(x, unicode):
            x = x.decode('utf-8', 'ignore')
        try:
            # Stem with each language; if a stemmer returns an empty string,
            # fall back to the original text for that language.
            stemmer = SnowballStemmer('spanish')
            x1 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
            if x1 == '':
                x1 = x
            stemmer = SnowballStemmer('english')
            x2 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
            if x2 == '':
                x2 = x
            stemmer = SnowballStemmer('portuguese')
            x3 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
            if x3 == '':
                x3 = x
            # Keep the shortest of the three stemmed versions.
            return min(x1, x2, x3, key=len)
        except Exception:
            return x
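
Since FeatureCountVectorizer.preprocess_unicode_text is specific to the surrounding project, here is a minimal per-word sketch of the same "shortest stem wins" idea using only NLTK; the function and test word are illustrative, not part of the original code:

    from nltk.stem import SnowballStemmer

    def shortest_stem(word):
        # Stem the word in all three languages and keep the shortest result,
        # mirroring the min(..., key=len) choice in embers_stem above.
        stems = [SnowballStemmer(lang).stem(word)
                 for lang in ('spanish', 'english', 'portuguese')]
        return min(stems, key=len)

    print(shortest_stem('corriendo'))  # the Spanish/Portuguese stemmers shorten this word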
Code Example #2
def tokenize(text, stemming=True, stoplist=[], remove_digits=False, lang='en'):
    translator = str.maketrans(
        string.punctuation,
        ' ' * len(string.punctuation))  # map punctuation to space
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    if stemming:
        if lang == 'en':
            stemmer = Stemmer()
        elif lang == 'it':
            stemmer = SnowballStemmer('italian')
        elif lang == 'de':
            stemmer = SnowballStemmer('german')
        elif lang == 'fa':
            stemmer = paStemmer()
        else:
            # Without this branch `stemmer` would be undefined for any other
            # language code and the analyzer construction would raise a NameError.
            raise ValueError("Unsupported language code: %s" % lang)
        analyzer = StemmingAnalyzer(stoplist=stoplist,
                                    minsize=1,
                                    stemfn=stemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=1)

    tokens = [token.text for token in analyzer(text)]
    if remove_digits:
        tokens = [
            word for word in tokens
            if not contains_digits(word) and 2 <= len(word)
        ]
    return tokens
Code Example #3
 def __init__(self, lang):
     s_lang = map_langs.get(lang, lang)
     self.re_digits = re.compile(r"^[0-9]+(?:[,.][0-9]+)*[ºªkKmM]?$")
     # Hyphen placed first in the character class and the final dot escaped
     # so both match literally.
     self.re_mail = re.compile(r"^[a-zA-Z0-9+_\-\.]+@[0-9a-zA-Z][-.0-9a-zA-Z]*\.[a-zA-Z]+$")
     self.re_url = re.compile(
         r"^[A-Za-z0-9-_]+:\/\/[A-Za-z0-9-_]*(?:\.[A-Za-z0-9-_]+)*|[A-Za-z0-9-_]+(?:\.[A-Za-z0-9-_]+)+$")
     if s_lang in SnowballStemmer.languages:
         self.stemmer = SnowballStemmer(s_lang)
     else:
         # Fall back to the original Porter stemmer for unsupported languages.
         self.stemmer = SnowballStemmer('porter')
     self.mapstem = {}
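
The fallback above relies on the language names NLTK exposes on the class; a quick check (the exact tuple depends on the installed NLTK version):

    from nltk.stem import SnowballStemmer

    print(SnowballStemmer.languages)
    # e.g. ('arabic', 'danish', 'dutch', 'english', ..., 'porter', ..., 'swedish')
    print('spanish' in SnowballStemmer.languages)  # True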
Code Example #4
    def __init__(self, min_occurrence=10, window=15, from_corpus=False):
        self.min_occurrence = min_occurrence
        self.window = window

        # map words to integers (more memory efficient and faster)
        self.word2int_count = count()
        self.word2int = defaultdict(self.word2int_count.__next__)

        # map city names also to ints
        self.city2int_count = count()
        self.city2int = defaultdict(self.city2int_count.__next__)

        self.stemmer = SnowballStemmer('german')
        self.stopwords = set(stopwords.words('german')).union(STOP_CITIES)
        self.stems = defaultdict(lambda: defaultdict(int))

        self.cores = multiprocessing.cpu_count()

        if from_corpus:
            print("loading spacy", file=sys.stderr, flush=True)
            # Note: these keyword flags are the spaCy 1.x loading API; newer
            # spaCy versions disable pipeline components via `disable=[...]`.
            self.nlp = spacy.load('de',
                                  parser=False,
                                  tagger=True,
                                  entity=False)
            print("done...", file=sys.stderr, flush=True)
Code Example #5
File: nlp_methods.py Project: kewilliams86/CSC-450
def stemLine(text):
    snow = SnowballStemmer('english')
    text = [snow.stem(t) for t in text.split()]
    return ' '.join(text)
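
Example usage; NLTK's SnowballStemmer lowercases its output, so the result is already lower case:

    print(stemLine("Dogs are running quickly"))
    # "dog are run quick" (approximate; exact stems depend on the Snowball rules)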
Code Example #6
class Config(object):
    # TEXT CLEANING
    TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

    # Checking if GPU is available or not
    is_cuda = torch.cuda.is_available()
    if is_cuda: device = torch.device("cuda")
    else: device = torch.device("cpu")

    # Stop words
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')

    # Model params
    batch_size = 32  # Batch size
    embed_size = 300  # Word2Vec Embedding size
    hidden_layers = 2  # Number of Hidden layers for Bi-directional LSTM
    hidden_size = 100  # Size of each Hidden layer in LSTM
    output_size = 2  # Output size
    hidden_size_linear = 128  # Fully connected layers
    dropout_keep = 0.51  # Dropout layer probability
    lr = 0.05  # Learning rate
    epochs = 100  # Number of Epochs

    # Directories path
    model_path = ""  # Trained model path state_dict.pt file
    embedding_path = "./Dataset/embedding_matrix.npz"  # Embedding matrix path .npz file
    train_path = "./Dataset/trainset.npz"  # Training data file path .npz
    test_path = "./Dataset/validset.npz"  # Testing data file path .npz
    tokenizer_path = ""  # Tokenizer file path which you can use during inference
    path = "./results/RCNN/0.0005"  # directory path to save results
Code Example #7
    def quadratic(cls, language: 'model.Language'):
        """
                This estimator computes the ratio of new words for a given user and language
                :param language: language of the text that needs to be estimated
                :param user: the user for which the difficulty estimation needs to be done
                :rtype: WordHistoryDifficultyEstimator
                :return: WordHistoryDifficultyEstimator with initialized user, language and word => score map
                        which can be used for determining scores for multiple articles for the same user and language
        """

        estimator = cls(language)

        freq_list = load_language_from_hermit(language.code)

        word_dict = dict()
        for k, v in freq_list.word_info_dict.items():
            word_dict[k] = v.frequency

        stemmer = SnowballStemmer(language.name.lower())

        score_map = defaultdict(int)

        for k, v in word_dict.items():
            score_map[stemmer.stem(k.lower())] += v

        max_freq = max(score_map.values())

        for k in score_map.keys():
            score_map[k] = (1 - score_map[k] / max_freq)**0.5

        estimator.score_map = score_map

        return estimator
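
The score formula maps frequent stems to low difficulty: a stem at the maximum frequency scores 0.0 and an unseen stem approaches 1.0. A small worked example with illustrative numbers:

    max_freq = 1000
    for freq in (1000, 500, 10):
        print(round((1 - freq / max_freq) ** 0.5, 3))
    # 0.0, 0.707, 0.995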
Code Example #8
def bag_of_words_spacy2(dataset):
    import spacy
    nlp = spacy.load('es_core_news_md')
    spanishstemmer = SnowballStemmer("spanish")
    all_stopwords = stopwords.words('spanish')
    all_stopwords.extend(
        ("saludo", "dia", "noche", "noches", "tardes", "buenos", "buenas",
         "atentamente", "dias", "estimado", "estimados", "estimada", "atte",
         "hola", "gracia", "caja", "respuesta", "adjunto", "mucha", "me",
         "cordoba", "buen", "ud"))
    removeList = ["no", "nunca"]
    all_stopwords = [e for e in all_stopwords if e not in removeList]
    corpus = []
    for i, value in dataset.items():
        review = str(html.unescape(dataset[i]))
        review = cleanhtml(review)
        review = re.sub(
            r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
            r"\1", normalize("NFD", review), 0, re.I)
        review = normalize('NFC', review)
        review = re.sub('[^a-zA-Zá-ú0-9]', ' ', review)
        review = review.lower()
        doc = nlp(review)
        stems = [
            # Compare and stem the token text; spaCy Token objects themselves
            # would never match the stopword strings.
            spanishstemmer.stem(token.text) for token in doc
            if token.text not in set(all_stopwords)
        ]
        review = ' '.join(stems)
        corpus.append(review)
    return corpus
Code Example #9
def dat_to_db(language: Language, fname: str = "songbd.dat"):
    """
    Builds a .db file out of the file created by pull_only_contents.
    This is really useful for creating a database without using API calls.
    Currently the only officially supported way to create a database.

    :param language: the language to create a database for
    :param fname: the song data to import (generated by pull_only_contents)
    :return: none - will overwrite .db file though
    """

    vocabulary = {}

    with open(fname, "rb") as f:
        songs = pickle.load(f)

    stemmer = SnowballStemmer(language.name)
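    # NOTE: NLTK expects a lowercase language name here, and the stemmer is
    # never actually applied in the loop below; presumably the intent was to
    # index the vocabulary by stemmer.stem(word) rather than the raw word.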

    for song in songs:
        lyrics = song.lyrics
        lyrics = lyrics.replace("\n", " ")
        lyrics = lyrics.split(" ")
        for word in lyrics:
            if word.isalpha() and not word == "":
                if word not in vocabulary:
                    vocabulary[word] = set()
                vocabulary[word].add(song.name)

    with open(language.file, "wb") as f:
        pickle.dump(vocabulary, f)
Code Example #10
def normalizeWords(text):
    ''' Text preprocessing '''
    stemmer = SnowballStemmer(language='english')
    test = re.compile(r'\W+', re.UNICODE).split(text[0].lower())
    stop_words = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
        'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
        'with', 'about', 'against', 'between', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
        'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
        't', 'can', 'will', 'just', 'don', 'should', 'now', 'html'
    ]
    test = [
        stemmer.stem(word) for word in test
        if word not in stop_words and word.isalpha() and len(word) > 2
    ]
    return (test, text[1], len(test))
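
A hedged usage sketch: the function expects a (text, label)-style pair and returns (stemmed_tokens, label, token_count). The input below is illustrative:

    tokens, label, n = normalizeWords(("Dogs are running quickly through HTML pages", "pos"))
    # roughly: tokens == ['dog', 'run', 'quick', 'page'], label == 'pos', n == 4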
Code Example #11
 def sentence_stemming(sentence):
     options = {
         "ar": "arabic",
         "da": "danish",
         "nl": "dutch",
         "en": "english",
         "fi": "finnish",
         "fr": "french",
         "de": "german",
         "hu": "hungarian",
         "it": "italian",
         "no": "norwegian",
         "pt": "portuguese",
         "ro": "romanian",
         "ru": "russian",
         "es": "spanish",
         "sw": "swedish"
     }
     c = detect(sentence)
     try:
         stemmer = SnowballStemmer(options[c])
     except KeyError:
         print("Language not supported")
         sys.exit()
     s = "".join(stemmer.stem(i) + " " for i in sentence.split())
     return "".join(s + " " for s in word_tokenize(s)
                    if s not in set(stopwords.words(options[c])))
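
A hedged usage sketch, assuming `detect` comes from langdetect and the usual NLTK data (punkt, stopwords) is installed; the sentence is illustrative:

    print(sentence_stemming("Los gatos están corriendo por la casa"))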
Code Example #12
def words_stemmer(words,
                  type="PorterStemmer",
                  lang="english",
                  encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "LancasterStemmer", "SnowballStemmer"
    ]
    if type is False or type not in supported_stemmers:
        return words
    else:
        stem_words = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
        elif type == "LancasterStemmer":
            stemmer = LancasterStemmer()
        else:  # SnowballStemmer
            stemmer = SnowballStemmer(lang)
        for word in words:
            stem_words.append(stemmer.stem(word))
        # Join the stems as text first, then encode once: joining byte strings
        # with a str separator would raise a TypeError under Python 3.
        return " ".join(stem_words).encode(encoding)
Code Example #13
def stemming(words_list,
             type="PorterStemmer",
             lang="english",
             encoding="utf8"):
    """Function stems all words with stemmer type
    Args:
        word_list  : list of words
    Return:
        The return value. Encoded list of words
    """
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer", "LancasterStemmer",
        "WordNetLemmatizer"
    ]
    if type is False or type not in supported_stemmers:
        return words_list
    else:
        encoded_list = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_list:
                encoded_list.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_list:
                encoded_list.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_list:
                encoded_list.append(stemmer.stem(word).encode(encoding))
        if type == "WordNetLemmatizer":
            wnl = WordNetLemmatizer()
            for word in words_list:
                encoded_list.append(wnl.lemmatize(word).encode(encoding))
        return encoded_list
Code Example #14
 def target_stemming_spanish(self, words):
     result = ""
     wordset = words.split(" ")
     stemmer = SnowballStemmer('spanish')
     for word in wordset:
         result += stemmer.stem(word) + "_"
     return result
Code Example #15
def bag_of_words_spacy(dataset):
    import spacy
    spanishstemmer = SnowballStemmer("spanish")
    nlp = spacy.load('es_core_news_md')
    nlp.Defaults.stop_words |= {
        "saludo", "dia", "noche", "noches", "tardes", "buenos", "buenas",
        "atentamente", "dias", "hola", "estimado", "estimados", "estimada",
        "atte"
    }
    nlp.Defaults.stop_words -= {"no", "nunca"}
    corpus = []
    for i, value in dataset.items():
        review = str(html.unescape(dataset[i]))
        review = cleanhtml(review)
        doc = nlp(review)
        words = [t.orth_.lower() for t in doc
                 if not (t.is_punct or t.is_stop)]  # drop punctuation and stopwords
        # lexical_tokens = [t.lower() for t in words if len(t) > 2 and t.isalpha()]  # lowercase, drop 2-letter words and digits
        review = ' '.join(words)
        doc = nlp(review)
        lemmas = [tok.lemma_.lower() for tok in doc]
        stems = [spanishstemmer.stem(token) for token in lemmas]
        review = ' '.join(stems)
        corpus.append(review)
    return corpus
Code Example #16
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer", "LancasterStemmer",
        "WordNetLemmatizer"
    ]
    if type is False or type not in supported_stemmers:
        return words_l
    else:
        stems = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_l:
                stems.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_l:
                stems.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_l:
                stems.append(stemmer.stem(word).encode(encoding))
        if type == "WordNetLemmatizer":  # TODO: context
            wnl = WordNetLemmatizer()
            for word in words_l:
                stems.append(wnl.lemmatize(word).encode(encoding))
        return stems
Code Example #17
 def __init__(self):
     super(DBRDPreprocessing, self).__init__(
         MultiLineTokenizer(),
         SnowballStemmer('english', ignore_stopwords=True),
         set(
             stopwords.words('english') + list(string.punctuation) +
             ["n't", "'t"]), [HTMLSymbolFilter()])
Code Example #18
File: common.py Project: Overhaug/HuJuRecSys
def stemming_and_stopwords(text):
    stemmer = SnowballStemmer("english")
    stop = stopwords.words("english")
    text = text.apply(lambda x: x.split())
    # text = text.apply(lambda word_list: [w for w in word_list if w not in stop])
    return text.apply(lambda word_list: " ".join(
        [stemmer.stem(w) for w in word_list if w not in stop]))
Code Example #19
def stem_text(text: str, lang_code: str) -> list:
    # Returns a list of stems, or an empty list for unsupported language codes.
    if lang_code in languages.languages:
        tokens = word_tokenize(text)
        stemmer = SnowballStemmer(languages.languages[lang_code])
        stems = [stemmer.stem(token) for token in tokens]
        return stems
    return []
Code Example #20
def _create_stemmer(stemmer_type):
    """ Initialize a stemmer """
    return {
        'Porter': PorterStemmer(),
        'Snowball': SnowballStemmer('english'),
        'Lancaster': LancasterStemmer(),
    }[stemmer_type]
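
Note that each call builds all three stemmers and returns only the requested one. Example usage:

    stemmer = _create_stemmer('Snowball')
    print(stemmer.stem('running'))  # 'run'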
Code Example #21
 def __init__(self, lang):
     lang_ipa = {'es': 'spa-Latn', 'en': 'eng-Latn'}
     lang_stemm = {'es': 'spanish', 'en': 'english'}
     self.lang = lang
     self.stemmer = SnowballStemmer(language=lang_stemm[lang])
     self.epi = epitran.Epitran(lang_ipa[lang])
     self.nlp = self.load_sapcy(lang)
Code Example #22
File: pokeapi_old.py Project: saulcov/PokeWebApp_old
 def Snowball(self):
     stemmer = SnowballStemmer('english')  # build the stemmer once, not per token
     newTokens = []
     for t in self.rm_stopwords():
         x = stemmer.stem(t)
         if x not in newTokens:
             newTokens.append(x)
     return newTokens
Code Example #23
    def text_process(self, text):
        # Remove punctuation
        no_punc = [
            char.lower() for char in text if char not in string.punctuation
        ]
        # Join the characters again to form the string.
        no_punc = ''.join(no_punc)
        # Remove any stopwords, downloading the corpus on first use if needed
        try:
            stop_words = set(stopwords.words(self.language))
        except LookupError:
            nltk.download('stopwords')
            stop_words = set(stopwords.words(self.language))
        no_stopwords = [
            word for word in no_punc.split() if word not in stop_words
        ]
        result = no_stopwords

        if self.tagging:
            # Tag each word
            tagged_words = self._tag_text(result)
            # Remove unwanted tags
            extracted_tags = self._extract_tags(tagged_words)
            result = extracted_tags

        if self.stemming:
            # Stem it
            stemmer = SnowballStemmer(self.language)
            result = [stemmer.stem(word) for word in result]

        return result
Code Example #24
 def __init__(self,
              max_edit_distance_dictionary: int = 5,
              prefix_length: int = 10,
              count_threshold: int = 1,
              compact_level: int = 5):
     super().__init__(max_edit_distance_dictionary, prefix_length,
                      count_threshold, compact_level)
     self.stemmer = SnowballStemmer('german')
Code Example #25
File: taskA.py Project: eliabisconti/haspeede
def get_stem(lang, sentence):
    stemmer = SnowballStemmer(lang)
    stemmed = ''
    for word in casual_tokenize(sentence):
        word = stemmer.stem(word)
        stemmed = stemmed + word + ' '

    return stemmed
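
Example usage; casual_tokenize is NLTK's tweet-aware tokenizer, and the result keeps a trailing space because the loop appends one after every stem:

    print(get_stem('english', 'Cats are running'))  # "cat are run "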
Code Example #26
 def __init__(self):
     # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
     self._tok = MosesTokenizer(lang='en')
     self._stemmer = SnowballStemmer('english')
     self._lemmatizer = TreeTagger(language='english')
     self._stopwords = set(open(STOPWORDS).read().splitlines())
     # alternatively: stopwords.words('french')
     self._porter_stemmer = nltk.stem.porter.PorterStemmer()
Code Example #27
def stem(word):
    '''Supported Snowball languages: danish, dutch, english, finnish, french,
    german, hungarian, italian, norwegian, porter, portuguese, romanian,
    russian, spanish, swedish.'''
    stemmer = SnowballStemmer("english")
    try:
        word = stemmer.stem(word).encode('utf-8')
    except Exception:
        pass
    return word
Code Example #28
def stemLine(title, abstract):

    snow = SnowballStemmer('english')

    title = [snow.stem(t) for t in title.split()]
    abstract = [snow.stem(a) for a in abstract.split()]

    return ' '.join(title) + '\t' + ' '.join(abstract)
Code Example #29
	def __init__(self):
		self.tokenize=RegexpTokenizer(r'\b([A-Za-z]+)\b') #remove the punctuations
		if ver==2:
			self.stemmer = SnowballStemmer("english")         #using stemmed version of words
		elif ver==1:
			self.stemmer = LancasterStemmer()	
		else:
			self.stemmer = PorterStemmer()
Code Example #30
def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()

    cleaned_text_list = []
    for text in input_text_list:
        # NOTE: str.translate() needs a table built with str.maketrans();
        # passing string.punctuation directly does not strip punctuation, which
        # is instead handled by the regex substitutions below.
        text = text.lower()  # Convert words to lower case and split them

        # text = " ".join(text)

        # Clean the text
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ",
                      text)  # 除A-Za-z0-9(),!?'`外的字符,去除
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t.co/[A-Za-z]{10}", " ", text)

        text = text.split()

        text = [word for word in text
                if word not in stoplist]  # remove stopwords once before stemming

        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]

        cleanwordlist = [
            word for word in stemmed_words if word not in stoplist
        ]  # remove stopwords again after stemming

        text = " ".join(cleanwordlist)

        cleaned_text_list.append(text)
    return cleaned_text_list
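
A hedged usage sketch, assuming read_stopwords() (defined elsewhere in the module) returns an iterable of stopwords; the input string is illustrative:

    cleaned = _remove_pattern_2(["What's this? Check https://t.co/abcdefghij now!"])
    print(cleaned[0])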