Example no. 1
class Index:
    """ Inverted index data structure """
    def __init__(self, tokenizer=None, stemmer=None, stopwords=None):
        """
        Parameters
        ----------
        tokenizer : callable, optional
            NLTK compatible tokenizer function (defaults to nltk.word_tokenize)
        stemmer : object, optional
            NLTK compatible stemmer (defaults to EnglishStemmer)
        stopwords : list, optional
            list of ignored words
        """
        self.tokenizer = tokenizer or nltk.word_tokenize
        self.stemmer = stemmer or EnglishStemmer()
        self.index = defaultdict(list)
        self.documents = {}
        self.__unique_id = 0
        # words in this set are skipped when indexing
        self.stopwords = set(stopwords) if stopwords else set()

    def lookup(self, word: str):
        """
        Lookup a word in the index

        Parameters
        ----------
        word : str
            word to look up; it is lower-cased and stemmed before the lookup
        """
        word = word.lower()
        if self.stemmer:
            word = self.stemmer.stem(word)

        return [self.documents.get(id, None) for id in self.index.get(word, [])]

    def add(self, document):
        """
        Add a document string to the index

        Parameters
        ----------
        document : str
            document

        """
        for token in [t.lower() for t in self.tokenizer(document)]:
            if token in self.stopwords:
                continue

            if self.stemmer:
                token = self.stemmer.stem(token)

            if self.__unique_id not in self.index[token]:
                self.index[token].append(self.__unique_id)

        self.documents[self.__unique_id] = document
        self.__unique_id += 1
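
A minimal usage sketch for the class above, assuming NLTK (with the punkt tokenizer data) is installed and that the names the class relies on are imported; the sample documents are illustrative:

import nltk
from collections import defaultdict
from nltk.stem.snowball import EnglishStemmer

index = Index(stopwords=['the', 'a', 'in', 'are'])
index.add("Dogs are running in the park")
index.add("A dog runs home")
print(index.lookup("running"))  # both documents, since 'running' and 'runs' share the stem 'run'
print(index.lookup("cat"))      # [] for a term that was never indexed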
Example no. 2
def stem_results(func):
    # Enclosing decorator reconstructed for completeness (its name is assumed);
    # the original snippet showed only the inner wrapper, which closes over ``func``.
    def stem_wrapper(_input=None):
        stemmer = EnglishStemmer()
        result = func(_input) if _input else func()
        if isinstance(result, list) and result:
            if isinstance(result[0], tuple):
                for i in range(len(result)):
                    result[i] = list(result[i])
                    result[i][0] = stemmer.stem(result[i][0])
                    result[i] = tuple(result[i])
                return result
            return [stemmer.stem(word) for word in result]
        return result
    return stem_wrapper
Example no. 3
def fix_lemma_problem(pred_scores, targets, space):
    from nltk.stem.snowball import EnglishStemmer
    es = EnglishStemmer()
    r = pred_scores.copy()
    lemmas = np.array([es.stem(v) for v in space.vocab])
    for i, t in enumerate(targets):
        g = es.stem(space.vocab[t])
        mask = (lemmas == g)
        #print space.vocab[t], np.sum(mask)
        r[i][mask] = -1e9
        #print r[i][mask]
    return r
Example no. 4
class Stemmer_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stemmer = EnglishStemmer()
        return

    def fit(self, mots, y=None):
        return self

    def transform(self, mots, y=None):
        # stem each word and store the result so the transformation is actually applied
        for i in range(len(mots)):
            mots[i] = [self.stemmer.stem(mot) for mot in mots[i]]
        return mots
Example no. 5
def similarity_score(word1, word2):
    """ see sections 2.3 and 2.4 of http://dx.doi.org.ezp-prod1.hul.harvard.edu/10.1109/TKDE.2003.1209005
    :type word1: string
    :type word2: string
    :return: float: between 0 and 1; similarity between two given words
    """
    stemmer = EnglishStemmer()
    if stemmer.stem(word1) == stemmer.stem(word2):
        return 1
    alpha = 0.2
    beta = 0.6
    l, h = get_path_length_and_subsumer_height(word1, word2)
    return exp((-1)*alpha*l)*((exp(beta*h)-exp((-1)*beta*h))/(exp(beta*h)+exp((-1)*beta*h)))
Example no. 6
def similarity_score(word1, word2):
    """ see sections 2.3 and 2.4 of http://dx.doi.org.ezp-prod1.hul.harvard.edu/10.1109/TKDE.2003.1209005
    :type word1: string
    :type word2: string
    :return: float: between 0 and 1; similarity between two given words
    """
    stemmer = EnglishStemmer()
    if stemmer.stem(word1) == stemmer.stem(word2):
        return 1
    alpha = 0.2
    beta = 0.6
    l, h = get_path_length_and_subsumer_height(word1, word2)
    return exp((-1) * alpha * l) * ((exp(beta * h) - exp(
        (-1) * beta * h)) / (exp(beta * h) + exp((-1) * beta * h)))
Example no. 7
def stem_terms_list(terms_list: list) -> list:
    """Stems the list of terms using the Snowball stemmer.
    Updates terms_list in place and also returns the same updated list."""
    stemmer = EnglishStemmer()
    for i in range(len(terms_list)):
        terms_list[i] = stemmer.stem(terms_list[i])
    return terms_list
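
A quick call illustrating the in-place update, assuming EnglishStemmer is imported from nltk.stem.snowball (the stems shown are typical Snowball outputs):

terms = ["running", "cats", "stemmed"]
stem_terms_list(terms)
print(terms)  # e.g. ['run', 'cat', 'stem']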
Example no. 8
    def fit(self, X, D, seq_length=None, wgt=1, wgt_inverse=0):

        max_seq_length = 0
        word_counts = defaultdict(lambda: 0)
        doc_counts = defaultdict(lambda: 1)
        for doc in X:
            sl = 0
            tokens = self.tokenize(doc)

            for w in set(tokens):
                doc_counts[w] += 1

            for w in tokens:
                sl += 1
                word_counts[w] += 1
            max_seq_length = sl if sl > max_seq_length else max_seq_length

        self.max_seq_length = max_seq_length

        word_counts = dict(word_counts)
        word_counts = sorted(word_counts.items(), reverse=True, key=operator.itemgetter(1))

        doc_words = doc_counts.keys()

        doc_ids = np.array(list(doc_counts.values())) / float(X.shape[0])
        self.doc_counts = dict(zip(doc_words, doc_ids))
        self.doc_counts["__PADDING__"] = 0
        self.doc_counts["__OOV_WORD__"] = 0

        self.word_index = dict()
        self.word_index["__PADDING__"] = self.pad_id
        self.word_index["__OOV_WORD__"] = self.oov_id

        self.word_counts = dict()
        for i, (w, c) in enumerate(word_counts):
            self.word_index[w] = i + 1 + self.base_word_id
            self.word_counts[i + 1 + self.base_word_id] = c

        self.inverse_word_index = {v: k for k, v in self.word_index.items()}

        from nltk.stem.snowball import EnglishStemmer
        stemmer = EnglishStemmer()
        V = []
        for i, x in enumerate(X):
            deplist = df["dependency_path"].values[i]
            tokens = word_tokenize(x)

            if seq_length is not None:
                v = np.ones(seq_length) * wgt_inverse
            else:
                v = np.ones(self.max_seq_length) * wgt_inverse

            for j, w in enumerate(tokens):
                if stemmer.stem(w.lower()) in deplist or w.lower() in deplist:
                    v[j] = wgt
            V.append(v)

        self.weights = np.array(V)

        return V, self.weights
Example no. 9
class Tokenizer(object):
    def __init__(self):
        self.cache = {}
        self.r_stemmer = RussianStemmer()
        self.e_stemmer = EnglishStemmer()

    def process_word(self, w):
        if w in self.cache:
            return self.cache[w]
        else:
            struct = check_structure(w)
            if struct == 'TRASH':
                w_proc = ''
            elif struct == 'WORD':
                if is_ascii(w):
                    w_proc = self.e_stemmer.stem(w)
                else:
                    w_proc = self.r_stemmer.stem(w)
            elif struct == 'NUMBER':
                w_proc = ''
            elif struct == 'COMPLEX':
                w_proc = w
            self.cache[w] = w_proc
            return w_proc

    def tokenize(self, text):
        text = preprosess_text(text)
        words = text.split(' ')
        tokens = []
        for w in words:
            tokens.append(self.process_word(w))
        tokens = [t for t in tokens if len(t)]
        return tokens
Example no. 10
def Granularity(sentenceArray):
    for sentence in sentenceArray:
        # print(sentence)
        try:

            stemmer = EnglishStemmer()
            sentence = re.sub(r'\#.*?$', '', sentence)
            sentence = re.sub(r'\#.*? ', '', sentence)
            sentence = re.sub(r'\@.*?$', '', sentence)
            sentence = re.sub(r'\@.*? ', '', sentence)
            sentence = re.sub(r'pic.twitter.*?$', '', sentence)
            sentence = re.sub(r'pic.twitter.*? ', '', sentence)
            sentence = re.sub(r'\'m', ' am', sentence)
            sentence = re.sub(r'\'d', ' would', sentence)
            sentence = re.sub(r'\'ll', ' will', sentence)
            sentence = re.sub(r'\&', 'and', sentence)
            sentence = re.sub(r'don\'t', 'do not', sentence)

            data = stemmer.stem(sentence)
            print(data)
            from nltk.corpus import stopwords

            sentence = str(data)
            stop = stopwords.words('english')
            final = [i for i in sentence.split() if i not in stop]
            finalstring = ' '.join(final)
            os.system("printf \"" + str(finalstring) + "\n\">> stemstop/" + word)
        except Exception as e:
            print(e)
Example no. 11
def query(word):
    db = MySQLdb.connect("127.0.0.1","dizing","ynr3","dizing" )
    cursor=db.cursor()
    snowball_stemmer = EnglishStemmer()
    stem2 = snowball_stemmer.stem(word)
    cursor.execute("SELECT * FROM words WHERE original=%s OR stem1=%s OR stem2=%s", (word,word,stem2))
    rows = cursor.fetchall()
    words1 = dict()
    words2 = dict()
    for row in rows:
        if row[1] == word or row[3]==word:
            words1[word] = row[0]
        else:
            words2[word] = row[0]
    scenes1 = []
    scenes2 = []
    for (i,words_dict) in [(1,words1), (2,words2)]:
        wids = words_dict.values()
        for wid in wids:
            sql = "SELECT s.sentence, s.start, s.stop, s.ready, m.title FROM scene AS s, words_scenes AS ws, movie as m " + \
                           "WHERE ws.wid=%d AND ws.sid=s.sid AND s.mid = m.mid" % int(wid)
            # print sql
            cursor.execute(sql)
            rows = cursor.fetchall()
            if (i==1): scenes1 += rows
            else: scenes2 += rows
    print scenes1
    print scenes2
    db.close()
    return scenes1 + scenes2
Example no. 12
def etape3_bis_get_counterWord2_with_sw_and_stemmer(data, stopword):
    '''Per-post word count that takes stopwords into account.
    /!\ This version also applies the stemmer.
    INPUT:
    ------
        - data
    OUTPUT:
    -------
        - data + counter column (chosen solution)
        OR
        - list of counters (on hold: possible performance issues)
    CALLED AS:
    ---------------
        - etape3(df2, sw)
    '''
    from nltk.stem.snowball import EnglishStemmer
    stemmer = EnglishStemmer()

    data_text = data['Body_clean']

    # Initialise the new feature
    data['Counter_WORD2'] = None

    for m_id, m_text in data_text.items():
        # List the tokens
        tokens = tokenizer.tokenize(m_text.lower())

        # Keep the tokens that are not stopwords and update the new column
        # data.at[m_id, 'Counter_WORD2'] =
        #      [w for w in tokens if not w in list(stopword)]
        data.at[m_id, 'Counter_WORD2'] = [stemmer.stem(w)
            for w in tokens if w not in list(stopword)]
Example no. 13
 def tokenize(self, file):
     stemmer = EnglishStemmer()
     # start_time = time.time()
     url = open(file, encoding='utf-8')
     html = url.read()
     soup = BeautifulSoup(html, 'lxml')
     for script in soup(['script', 'style']):
         script.extract()
     word_weights = defaultdict(int)
     # total_words = len(soup.get_text())
     # Index position: index in tokens, creation of auxiliary
     for i in soup.find_all(
         ['title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'strong', 'p']):
         tokens = set()
         for t in re.findall(r'\w+', i.text):
             if len(t) >= 2:
                 tokens.add(t.lower())
         weight = [1, 4, 10]
         for token in tokens:
             token = stemmer.stem(token)
             if i.name == 'title':
                 word_weights[token] += weight[2]
             elif i.name == 'strong' or i.name == 'b' or i.name[0] == 'h':
                 word_weights[token] += weight[1]
             else:
                 word_weights[token] += weight[0]
     url.close()
     # print(time.time() - start_time, "seconds")
     return word_weights
Example no. 14
def generate_tokens(document_text):
    """Process text so it can be added to the index. The NLTK module provides a 
    tokenizer, word stemmer, and a list of stopwords."""

    stemmer = EnglishStemmer() # from NLTK module

    try:
        tokens = [additional_pruning(remove_hyphens(stemmer.stem(token.lower()))) 
                  for token in nltk.word_tokenize(document_text) if len(token) > 2 and token]
    except Exception as e:
        print('error: {}\ndocument_text: {}'.format(e, document_text))
        tokens = []
    good_tokens = []
    
    for token in tokens:
        good_tokens.extend([unidecode(item) for item in process_url_prefixes(token)])
        good_tokens.extend([custom_youtube_stemmer(item) for item in process_url_prefixes(token) 
                            if custom_youtube_stemmer(item)])

    better_tokens = []
    for token in good_tokens:
        if has_alpha_char(token):
            better_tokens.extend(process_url_ends(token))  # can you use list comprehension with extend?

    return better_tokens
Example no. 15
def getStem(*words):
    '''Return a list of the stems corresponding to the words provided as arguments'''
    stemmer = EnglishStemmer()
    out = list()
    for word in words:
        out.append(stemmer.stem(word))
    return out
Example no. 16
class StemmedCountVectorizer(CountVectorizer):
    def __init__(self,
                 lang,
                 strip_accents=None,
                 ngram_range=(1, 1),
                 max_df=1.0,
                 min_df=1,
                 stop_words=None):

        self.lang = lang  # keep the constructor argument so scikit-learn's get_params()/clone() work
        if lang == 'de':
            self.stemmer = GermanStemmer()
        else:
            self.stemmer = EnglishStemmer()

        super(self.__class__, self).__init__(stop_words=stop_words,
                                             strip_accents=strip_accents,
                                             ngram_range=ngram_range,
                                             max_df=max_df,
                                             min_df=min_df)

    def _stem_tokens(self, words):
        return [self.stemmer.stem(w) for w in words]

    def build_analyzer(self):
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()

        return lambda doc: self._word_ngrams(
            self._stem_tokens(tokenize(preprocess(self.decode(doc)))),
            stop_words)
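
A small usage sketch, assuming scikit-learn is installed and GermanStemmer/EnglishStemmer come from nltk.stem.snowball; note that build_analyzer relies on CountVectorizer's private _word_ngrams helper, so behaviour may vary across scikit-learn versions:

from nltk.stem.snowball import EnglishStemmer, GermanStemmer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = StemmedCountVectorizer(lang='en', stop_words='english')
X = vectorizer.fit_transform(["dogs are running fast", "a dog runs home"])
print(sorted(vectorizer.vocabulary_))  # stemmed features such as 'dog', 'run'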
Example no. 17
def getAllStemEntities(entities):
    st = EnglishStemmer()
    q = [",", ".", "!", "?", ":", ";"]
    tmp = []
    sourceEntities = [x for x in entities if len(x) > 0]
    np.random.shuffle(entities)

    for i in xrange(len(entities)):
        if len(entities[i]) == 0:
            continue
        if i % 1000 == 0:
            print i
        entities[i] = entities[i].lower()
        entities[i] = entities[i].replace(" - ", " \u2013 ", entities[i].count(" - "))
        entities[i] = entities[i].replace(" -", " \u2013", entities[i].count(" -"))
        entities[i] = entities[i].replace("- ", "\u2013 ", entities[i].count("- "))
        entities[i] = entities[i].replace("-", " - ", entities[i].count("-"))
        entities[i] = entities[i].replace(")", " )", entities[i].count(")"))
        entities[i] = entities[i].replace("(", "( ", entities[i].count("("))
        entities[i] = entities[i].replace("\u0027", " \u0027", entities.count("\u0027"))
        for w in q:
            entities[i] = entities[i].replace(w, " " + w, entities[i].count(w))
        word = entities[i].split(" ")
        s = ""
        for w in word:
            s += st.stem(unicode(w)) + " "
        tmp.append(s[:-1])
        if len(tmp) > 50:
            break

    return tmp, entities[: len(tmp)]
Example no. 18
    def get_unique_words_by_stemming(self):
        """Compare a starting word and a group of user submitted words.
           Using stemming to determing similarity.
           Return a list unique words, with matching user.
        """
        stemmer = EnglishStemmer()

        #Make starting list
        list_of_words = list(self.user_submissions.values())
        #Lower case all words for comparison
        list_of_words = [word.lower() for word in list_of_words]
        #Add starting word to list.
        list_of_words.append(self.prompt)
        #Remove identical unique_words.
        counter_of_words = Counter(list_of_words)
        list_of_words = []
        for word in counter_of_words:
            if counter_of_words[word] == 1:
                list_of_words.append(word)

        stemmed_dict = dict()
        for word in list_of_words:
            stemmed_dict[word] = stemmer.stem(word)

        unique_words = [key for key, value in stemmed_dict.items()
                        if list(stemmed_dict.values()).count(value) == 1]
        if self.prompt in unique_words:
            unique_words.remove(self.prompt)
        return unique_words
Example no. 19
def getAllStemEntities(entities):
    st = EnglishStemmer()
    q = [',', '.', '!', '?', ':', ';']
    tmp = []
    sourceEntities = [x for x in entities if len(x)>0]
    np.random.shuffle(entities)

    for i in xrange(len(entities)):
        if len(entities[i]) == 0:
            continue
        if i % 1000 == 0:
            print i
        entities[i] = entities[i].lower()
        entities[i] = entities[i].replace(' - ', ' \u2013 ', entities[i].count(' - '))
        entities[i] = entities[i].replace(' -', ' \u2013', entities[i].count(' -'))
        entities[i] = entities[i].replace('- ', '\u2013 ', entities[i].count('- '))
        entities[i] = entities[i].replace('-', ' - ', entities[i].count('-'))
        entities[i] = entities[i].replace(')', ' )', entities[i].count(')'))
        entities[i] = entities[i].replace('(', '( ', entities[i].count('('))
        entities[i] = entities[i].replace('\u0027', ' \u0027', entities[i].count('\u0027'))
        for w in q:
            entities[i]=entities[i].replace(w, ' '+w, entities[i].count(w))
        word = entities[i].split(' ')
        s = ''
        for w in word:
            s  += st.stem(unicode(w)) + ' '
        tmp.append(s[:-1])
        if len(tmp) > 50:
            break

    return tmp, entities[:len(tmp)]
Example no. 20
def stem_tokenizer(text):
    stemmer = EnglishStemmer(ignore_stopwords=True)
    # Removing the URL links
    words = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    words = re.sub(r"[^A-Za-z0-9\-]", " ", words).lower().split()
    words = [stemmer.stem(word) for word in words]
    return words
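
For example, assuming re and EnglishStemmer are imported as the function expects (stopwords are kept but left unstemmed because of ignore_stopwords=True):

import re
from nltk.stem.snowball import EnglishStemmer

print(stem_tokenizer("Check the URL https://t.co/abc123 now!"))
# ['check', 'the', 'url', 'now'] -- the link is stripped before tokenizing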
Example no. 21
def stem_docs(docs):
    """Return stemmed documents"""
    stemmer = EnglishStemmer()
    return [
        " ".join([stemmer.stem(word) for word in sentence.split(" ")])
        for sentence in docs
    ]
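
A quick call, assuming EnglishStemmer is imported from nltk.stem.snowball as above:

print(stem_docs(["dogs are running", "the cat runs"]))
# ['dog are run', 'the cat run']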
Example no. 22
def _tokenize_sentences(text):
    '''
    Tokenize sentences by performing the following:
        - convert to uniform case (lower)
        - numeric removal
        - punctuation removal
        - word stemming
        - stop word removal

    Token lists are converted to token strings for hashability
    '''
    original_sentences = sent_tokenize(text)
    stops = set(stopwords.words('english'))
    # Sentences to lower case
    tokenized_sentences = list(map(lambda s: s.lower(), original_sentences))
    # Remove numbers
    regex = re.compile(r"[0-9]+")
    tokenized_sentences = [regex.sub("", sentence)
                           for sentence in tokenized_sentences]
    ## Strip all punctuation
    regex = re.compile(str.format('([{0}])+', re.escape(punctuation)))
    tokenized_sentences = [regex.sub(" ", sentence)
                           for sentence in tokenized_sentences]
    # Strip stop words
    tokenized_sentences = list(map(lambda s: filter(lambda w: w not in stops, s.split()), tokenized_sentences))
    ## Stem the sentences
    stemmer = EnglishStemmer()
    tokenized_sentences = [
        [stemmer.stem(word) for word in sentence] for sentence in tokenized_sentences]

    # Join the sentences back into strings...
    tokenized_sentences = [' '.join(lst) for lst in tokenized_sentences]
    return _merge_sentences(original_sentences, tokenized_sentences)
Example no. 23
def unsupervised_predict(post, lda_tags_df_scaled, dictionary, lda):
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    stemmer = EnglishStemmer()
    stemmed = ' '.join(stemmer.stem(WordNetLemmatizer().lemmatize(w, pos='v')) for w in w_tokenizer.tokenize(post))

    pattern = re.compile('[^A-Za-z +]')
    normalized = re.sub(pattern, ' ', stemmed)

    result = []
    for token in gensim.utils.simple_preprocess(normalized):
        if token not in gensim.parsing.preprocessing.STOPWORDS:
            result.append(token)
    this_dictionary = []
    this_dictionary.append(result)
    
    other_corpus = [dictionary.doc2bow(text) for text in this_dictionary]
    unseen_doc = other_corpus[0]
    vector = lda[unseen_doc]

    topic = vector[0][0][0]
    perc = vector[0][0][1]
    tags = lda_tags_df_scaled[int(topic)]
    tags_output = tags.sort_values(ascending=False).head(5)    
    
    return tags_output
Example no. 24
def clean(toClean):
    """ This function cleans the text before adding it to the JSON.
    It operates by using regex to remove non-alphabetical charecters
    and extra spcaces, then uses NLTK to remove stop words

    Arguments:
        toClean {str} -- A string of text to be cleaned
    Returns:
        {str} -- A cleaned string of text
    """

    stemmer = EnglishStemmer()
    stopWords = set(stopwords.words('english'))
    clean = ""

    # Force lowercase
    toClean = toClean.lower()

    # remove non alphabetical chars and remove extra spaces
    toClean = re.sub('[^a-z,.!?]', ' ',
                     toClean)  # use [^a-zA-Z] if not forced lowercase
    # Replaces sequential spaces with single space
    toClean = re.sub(' +', ' ', toClean)

    # tokenize words for stemming and stop word removal
    tokens = word_tokenize(toClean)

    # for each token, stem token and then check if it is stop word.
    # adds token to the clean text if it is not a stop word.
    for token in tokens:
        token = stemmer.stem(token)
        if token not in stopWords:
            clean = clean + (str(token) + " ")

    return clean
Example no. 25
def filter_misc(series, pos=None, stem=False):
    new_series = pd.Series(index=series.index)

    if stem == True:
        # does not stem stopwords
        stemmer = EnglishStemmer(ignore_stopwords=True)

    if pos is not None:
        for index, text in enumerate(series):
            new_series.iloc[index] = ' '.join([
                y for y, tag in nltk.pos_tag(nltk.word_tokenize(text))
                if tag in pos
            ])
        pos_flag = True
    else:
        pos_flag = False

    if stem is True:
        # if both pos and stem
        if pos_flag == True:
            use_series = new_series
        # just stem
        else:
            use_series = series
        for index, text in enumerate(use_series):
            text_list = text.split()
            stemmed_text = []
            for word in text_list:
                stemmed_text += [stemmer.stem(word)]
            new_series.iloc[index] = ' '.join(stemmed_text)

    return new_series
Example no. 26
 def process_text(self, text, k):
     es = EnglishStemmer()
     doc = "".join([
         es.stem(w.lower()) for w in text.split()
         if w.lower() not in (stopwords.words('english'))
     ])
     return [doc[index:index + k] for index in range(len(doc) - k + 1)]
Example no. 27
def pre_proc(in_str, removestop=True, alwayskeep=False, word_punc=False, unquote=False):
    # remove accents, wordify punctuation
    in_str = strip_accents(in_str, wordify=word_punc, unquote=unquote)
    en_stem = EnglishStemmer()
    # tokenize string
    if removestop:  # remove stop words
        tok_list = filter(lambda x: x not in stopwords.words('english'), wordpunct_tokenize(in_str))
    else:
        tok_list = wordpunct_tokenize(in_str)
    new_tok_list = []
    for tok in tok_list:
        if tok not in WORD_PUNC_LIST:
            correct_spell = HOBJ.spell(tok)
            if not correct_spell:
                suggestions = [strip_accents(tmp_sug).lower() for tmp_sug in HOBJ.suggest(tok)]
            else:
                suggestions = []
            if correct_spell or (tok.lower() in suggestions):
                new_tok_list.append(tok)
                tok_stem = en_stem.stem(tok)
                if tok_stem != tok:
                    new_tok_list.append(tok_stem)
            elif len(tok) >= 3:
                tok_sug = None
                lev_perc = .34
                for sug in suggestions:
                    if not tok_sug and tok == sug[1:]:
                        tok_sug = sug
                if not tok_sug:
                    for sug in suggestions:
                        tmp_lev_perc = float(lev_dist(tok, sug)) / float(max(len(tok),len(sug)))
                        if not tok_sug and tmp_lev_perc < lev_perc:
                            tok_sug = sug
                            lev_perc = tmp_lev_perc
                if tok_sug:
                    new_tok_list.append(tok_sug)
                    tok_stem = en_stem.stem(tok_sug)
                    if tok_stem != tok_sug:
                        new_tok_list.append(tok_stem)
                elif alwayskeep:
                    new_tok_list.append(tok)
            elif alwayskeep:
                new_tok_list.append(tok)
        else:
            new_tok_list.append(tok)
    out_str = string.join(new_tok_list, ' ')
    return out_str.lower()
Example no. 28
 def _execute(self):
     
     corpus = mongoExtractText(self.name)
     stemmer = EnglishStemmer()
     for item in corpus:
         line = item.replace(',', ' ')
         stemmed_line = stemmer.stem(line)
         self.sentiment.append((sentiment.sentiment(stemmed_line), stemmed_line))
Example no. 29
 def run(self, data):
     english = EnglishStemmer()
     for corpus in data:
         corpus.tokenized_contents = [
             english.stem(word) for word in corpus.tokenized_contents
         ]
         corpus.contents = ''.join(corpus.tokenized_contents)
     return data
Example no. 30
def tokenize_en(text):
    def is_ok(item):
        return True if item.lower() == item and all(elem.isalpha() and elem in string.ascii_letters for elem in item) else False
    from nltk.stem.snowball import EnglishStemmer
    stemmer = EnglishStemmer(ignore_stopwords=True)
    tokens = word_tokenize(text)
    result = [stemmer.stem(item) for item in tokens if is_ok(item)]
    return result
Example no. 31
def as_eng_postagged_doc(doc):
    '''Uses nltk default tagger.'''
    tags    = [t for _, t in nltk.pos_tag(list(doc.word))]
    stemmer = EnglishStemmer()
    lemmata = [stemmer.stem(w) for w in list(doc.word)]
    doc['pos']   = Series(tags)
    doc['lemma'] = Series(lemmata)
    return doc
Example no. 32
def stem_word(word):
    """
    Stem words
    :param word: (str) text word
    :returns: stemmed word
    """
    stemmer = EnglishStemmer()
    return stemmer.stem(word)
Example no. 33
 def parse(self, text):
     stemmer = EnglishStemmer()
     text = text.decode('utf-8')
     tokens = filter(
         lambda token: not self.stop_set.__contains__(token),
         map(lambda token: stemmer.stem(token.lower()),
             wordpunct_tokenize(text)))
     return tokens
Example no. 34
 def stemArticle(self, doc):
     stemmer_fr = FrenchStemmer()
     stemmer_en = EnglishStemmer()
     
     stemmedArticle = [str(stemmer_fr.stem(w)) for w in doc]
     stemmedArticle = [str(stemmer_en.stem(w)) for w in stemmedArticle]   
     
     return stemmedArticle
Example no. 35
def tokenize(text, stem=False):
    tokens = [word for word in word_tokenize(text) if word.isalpha()]

    if stem:
        stemmer = EnglishStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)
Example no. 36
def word_count(document):
    words = get_words(document["content"])
    stemmer = EnglishStemmer()
    words = [stemmer.stem(word) for word in words]
    fdist = FreqDist(words)
    for word, frequency in fdist.most_common(50):
        print(u'{};{}'.format(word, frequency))

    fdist.plot(30, cumulative=False)
Example no. 37
class TextProcessor:
    def __init__(self):
        self._stemmer = EnglishStemmer()

    def process(self, text: str) -> [str]:
        words = filter(
            lambda word: word not in stopwords.words("english") and word.isalpha(),
            word_tokenize(text))
        return [self._stemmer.stem(word) for word in words]
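
A brief usage sketch, assuming the NLTK punkt and stopwords data are downloaded; note that the stopword filter is case-sensitive, so the input is lower-cased here:

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

processor = TextProcessor()
print(processor.process("the cats were running home"))  # ['cat', 'run', 'home']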
Example no. 38
def text_processing(text, min_size=4, sep_char=' '):
	from nltk.stem.snowball import EnglishStemmer
	from nltk.corpus import stopwords as stwds

	stemmer = EnglishStemmer()
	stopwords = set(stwds.words('english') + 
			contractions_without_punc)
	
	text = [stemmer.stem(w) for w in text.split(sep_char) 
			if not w in stopwords
			and len(w) >= min_size]

	return text
Example no. 39
def computeSentiment(tweet_text):
    pos_count = 0
    neg_count = 0
    pos_terms = []
    neg_terms = []
    st = EnglishStemmer()

    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        #print st.stem(t.lower())
        if st.stem(t.lower()) in negative_terms:
            neg_terms.append(t.lower())
            neg_count += 1
        elif st.stem(t.lower()) in positive_terms:
            pos_terms.append(t.lower())
            pos_count += 1

    return pos_count, neg_count, set(pos_terms), set(neg_terms)
Example no. 40
def text_processing(text, min_size=4, sep_char=' '):
    from nltk.stem.snowball import EnglishStemmer
    from nltk.corpus import stopwords as stwds

    stemmer = EnglishStemmer()
    stopwords = set(stwds.words('english') + contractions_without_punc)

    text = [
        stemmer.stem(w) for w in text.split(sep_char)
        if not w in stopwords and len(w) >= min_size
    ]

    return text
Example no. 41
 def use_snowball_stemmer(self,word):
     """
     return stemmed words used snowball algorithm
     :param word:
     :return:
     """
     englishStemmer=EnglishStemmer()
     stemmed_word= englishStemmer.stem(word)
     return stemmed_word
Example no. 42
class StemTokenizer(object):
    """
	Transform each word to its stemmed version
	e.g. studies --> studi
	"""
    def __init__(self):
        self.st = EnglishStemmer()

    def __call__(self, doc):
        return [self.st.stem(t) for t in word_tokenize(doc)]
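
A usage sketch that passes the tokenizer to scikit-learn's CountVectorizer, assuming NLTK's punkt data is available (CountVectorizer may warn that its token_pattern is ignored when a custom tokenizer is given):

from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=StemTokenizer())
X = vectorizer.fit_transform(["He studies math", "Many studies exist"])
print(sorted(vectorizer.vocabulary_))  # contains 'studi' rather than 'studies'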
Example no. 43
def getLemmatizerInfo(pathArticle):

    data = open(pathArticle, "r")
    text1 = data.read().decode('utf-8')

    sourceText = text1

    links1 = []
    l = 0
    for q in text1.split():
        if q == '\ufeff':
            continue
        links1.append([text1.find(q,l), q])
        l = len(q) + 1 + text1.find(q,l)

    text1 = text1.replace(' - ', ' \u2013 ', text1.count(' - '))
    text1 = text1.replace(' -', ' \u2013', text1.count(' -'))
    text1 = text1.replace('- ', '\u2013 ', text1.count('- '))
    text1 = text1.replace('-', ' - ', text1.count('-'))
    text1 = text1.replace('(', '( ', text1.count('('))
    text1 = text1.replace(')', ' )', text1.count(')'))
    text1 = text1.replace(' \u0027', ' \u301E', text1.count(' \u0027'))
    text1 = text1.replace('\u0027', ' \u0027', text1.count('\u0027'))
    text1 = text1.split()
    if text1[0] == u'\ufeff':
        text1=text1[1:]
    text = []
    for word in text1:
        text2 = []
        if len(word) == 0:
            continue
        while word[len(word)-1] in [',','.','!','?',':',';']:
            text2.append(word[len(word)-1])
            word = word[:-1]
            if len(word) == 0:
                break
        text.append(word)
        for i in range(len(text2)-1, -1,-1):
            text.append(text2[i])

    out = ''

    st = EnglishStemmer()

    l = 0
    links = []
    for word in text:
        if isOk(word):
            q = st.stem(word) + ' '
        else:
            q = word + ' '
        out += q.lower()
        links.append([l, q])
        l += len(q)
    return out, links, links1, sourceText
Example no. 44
def getLemmatizerInfo(pathArticle):

    data = open(pathArticle, "r")
    text1 = data.read().decode("utf-8")

    sourceText = text1

    links1 = []
    l = 0
    for q in text1.split():
        if q == "\ufeff":
            continue
        links1.append([text1.find(q, l), q])
        l = len(q) + 1 + text1.find(q, l)

    text1 = text1.replace(" - ", " \u2013 ", text1.count(" - "))
    text1 = text1.replace(" -", " \u2013", text1.count(" -"))
    text1 = text1.replace("- ", "\u2013 ", text1.count("- "))
    text1 = text1.replace("-", " - ", text1.count("-"))
    text1 = text1.replace("(", "( ", text1.count("("))
    text1 = text1.replace(")", " )", text1.count(")"))
    text1 = text1.replace(" \u0027", " \u301E", text1.count(" \u0027"))
    text1 = text1.replace("\u0027", " \u0027", text1.count("\u0027"))
    text1 = text1.split()
    if text1[0] == u"\ufeff":
        text1 = text1[1:]
    text = []
    for word in text1:
        text2 = []
        if len(word) == 0:
            continue
        while word[len(word) - 1] in [",", ".", "!", "?", ":", ";"]:
            text2.append(word[len(word) - 1])
            word = word[:-1]
            if len(word) == 0:
                break
        text.append(word)
        for i in range(len(text2) - 1, -1, -1):
            text.append(text2[i])

    out = ""

    st = EnglishStemmer()

    l = 0
    links = []
    for word in text:
        if isOk(word):
            q = st.stem(word) + " "
        else:
            q = word + " "
        out += q.lower()
        links.append([l, q])
        l += len(q)
    return out, links, links1, sourceText
Example no. 45
def get_query(vec_dict):
    dim = 300  # Dimension of the GloVe vectors chosen

    # initialize stemmer for search in GLoVe vector space
    st = EnglishStemmer()

    query = raw_input("Please enter search query:")
    query_vector = np.zeros(dim)
    numWords = 0
    for word in query.split():
        if st.stem(word) in vec_dict:
            query_vector += vec_dict[st.stem(word)].astype(np.float)
            numWords += 1
        elif st.stem(word) + "e" in vec_dict:
            query_vector += vec_dict[st.stem(word) + "e"].astype(np.float)
            numWords += 1

    query_vector /= numWords

    return query, query_vector
Example no. 46
def stemming(tweet):
    tweets = tweet.split()
    wrdStemmer = EnglishStemmer()
    stemTweet =[]
    try:
        for tweet in tweets:
            tweet = wrdStemmer.stem(tweet)
            stemTweet.append(tweet)
    except:
        print("Error: Stemming")
    return " ".join(stemTweet)
Example no. 47
class EnglishStemmer(PreProcessor):

    def __init__(self):
        self.stemmer = SnowballEnglishStemmer()

    def process_sentence(self, sentence):
        stemmed_sentence = []
        for token in wordpunct_tokenize(sentence):
            if len(token) > 1:
                stemmed_sentence.append(self.stemmer.stem(token))
        return ' '.join(stemmed_sentence)
Example no. 48
def main(fname):
  e = EnglishStemmer()

  n, a = 0, 0
  for line in open(fname):
    title, body, tags, creationdate, acceptedanswerid, score, viewcount = eval(line)

    # Process text into tokens
    html_tags = RX_OPEN_TAGS.findall(body)
    body = RX_TAGS.sub("",body)
    print " ".join(e.stem(s) for s in RX_NONWORD.split(body))
    M = bayes.NaiveLearner(adjust_threshold=True, name="Adjusted Naive Bayes")
Example no. 49
def get_stemmed_keywords(keywords):

  stemmer = EnglishStemmer()
  stemmed_keywords = list(keywords)
  # split into list of list
  stemmed_keywords = [keyword.split() for keyword in stemmed_keywords]
  # stem individual words
  stemmed_keywords = [list(stemmer.stem(word) for word in keyword) for keyword in stemmed_keywords]
  # list of words to string
  stemmed_keywords = [' '.join(keyword).encode('ascii') for keyword in stemmed_keywords]

  return stemmed_keywords
Example no. 50
def stemmed(text, snowball=False):
    """Returns stemmed text
    """
    if snowball:
        st = EnglishStemmer()
    else:
        st = PorterStemmer()
    words = wordpunct_tokenize(text)
    words = [st.stem(w) for w in words]
    text = ' '.join(words)

    return text
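
A quick comparison of the two stemmers, assuming PorterStemmer, EnglishStemmer and wordpunct_tokenize are imported as the function expects; Porter typically reduces 'generously' to 'gener', while the Snowball (English) stemmer keeps 'generous':

from nltk.stem import PorterStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize

print(stemmed("they behaved generously"))                 # Porter stems
print(stemmed("they behaved generously", snowball=True))  # Snowball stems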
Example no. 51
def nltk_tokenizer(text, min_size=4, *args, **kwargs):
	from nltk.stem.snowball import EnglishStemmer
	from nltk.corpus import stopwords as stwds
	from nltk.tokenize import TreebankWordTokenizer
	
	stemmer = EnglishStemmer()
	stopwords = set(stwds.words('english'))
	
	text = [stemmer.stem(w) for w in TreebankWordTokenizer().
			tokenize(text) if not w in stopwords 
			and len(w) >= min_size]

	return text
Example no. 52
def normalize_tags():
    cursor.execute('SELECT app_id, tag, times FROM tag_app_rel;')
    all_tag_data = defaultdict(dict)
    for r in cursor:
        all_tag_data[r[0]][r[1]] = r[2]
    from nltk.stem.snowball import EnglishStemmer
    stemmer = EnglishStemmer()
    for app_id, tag_to_times in all_tag_data.iteritems():
        normalized_app_tag_dict = defaultdict(int)
        for tag, times in tag_to_times.iteritems():
            normalized_app_tag_dict[stemmer.stem(tag)] += times
        for tag, times in normalized_app_tag_dict.iteritems():
            cursor.execute('INSERT INTO tag_app_relation (app_id, tag, times) VALUES (%s, %s, %s)', (app_id, tag, times))
Example no. 53
def tokenize_documents(documents):

    stop_words = stopwords.words('english') + stopwords.words('spanish') #common words to be filtered
    english = EnglishStemmer()
    arabic = ISRIStemmer()

    punctuation = { ord(char): None for char in string.punctuation}

    def valid_word(token, filtered=stop_words): 
        # Returns false for common words, links, and strange patterns
            if (token in filtered) or (token[0:4] == u'http') or\
            (token in string.punctuation):
                return False
            else:
                return True

    for doc in documents:

        row = doc[0]
        doc = doc[1]

        if doc is not None:

            # remove trailing whitespace
            doc = doc.strip()
            # remove twitter handles (words in doc starting with @)
            doc = re.sub(r"@\w+|\b@\w+", "", doc)
            # lowercase letters
            doc = doc.lower()
            # remove punctuation
            doc = doc.translate(punctuation)

            # tokenization: handles documents with arabic or foreign characters
            tokens = nltk.tokenize.wordpunct_tokenize(doc)

            cleaned_tokens = []
            for token in tokens:

                # for valid words, correct spellings of gaddafi and stem words
                if valid_word(token):
                
                    if token in [u'gadhafi', u'gadafi', u'ghadhafi', u'kadhafi', u'khadafi', u'kaddafi']:
                        token = u'gaddafi'
                    else:
                        token = arabic.stem(english.stem(token)) 

                    cleaned_tokens.append(token)    

            yield row
            yield cleaned_tokens
Example no. 54
def stemVector(vector,method="lemmatize"):
    output=[]
    if method=='lemmatize':
        wnl = WordNetLemmatizer()
        for i in vector:
            i=wnl.lemmatize(i)
            output.append(i)
    if method=='snowball':
        st=EnglishStemmer()
        for i in vector:
            i=st.stem(i)
            output.append(i)
    if method=='porter':
        st=PorterStemmer()
        for i in vector:
            i=st.stem(i)
            output.append(i)
    if method=='lancaster':
        st=LancasterStemmer()
        for i in vector:
            i=st.stem(i)
            output.append(i)
    return output
Example no. 55
def stem_sen(list_sentences):
  stemmer = EnglishStemmer()
  # map back should be a dict with words,
  # each word map to 3 version: noun, adj, verb,
  # and each version is a list of pair
  lem = WordNetLemmatizer()
  mapping_back = {}
  res_list = []
  res_sen = []
  stemmer = EnglishStemmer()

  # of course we want to return a list of sentences back as well
  for sent in list_sentences:
    tmp_list = []
    tok_list = word_tokenize(sent)
    tok_pos = nltk.pos_tag(tok_list)
    for tok,pos in tok_pos:
      if (tok.lower() in stopwords.words('english')):
        continue
      if len(tok) == 1:
        continue
      tok = lem.lemmatize(tok)
      pos = pos[:2]
      if ('NN' not in pos) and ('JJ' not in pos) and ('VB' not in pos):
        continue
      stem_tok = stemmer.stem(tok)
      if (stem_tok not in mapping_back):
        mapping_back[stem_tok] = {}
      if pos not in mapping_back[stem_tok]:
        mapping_back[stem_tok][pos] = {}

      # increase count
      if tok not in mapping_back[stem_tok][pos]:
        mapping_back[stem_tok][pos][tok] = 1
      else:
        mapping_back[stem_tok][pos][tok] += 1
      tmp_list.append(stem_tok + '-' + pos)
    res_sen.append(tmp_list)
  res_map = {}

  # do the second run through to find the most frequent - mapping
  for tok in mapping_back:
    for pos in mapping_back[tok]:
      tmp_tok = tok + '-' + pos
      # find the most frequently, unstemmed word correspond to the stemmer + tagged
      most_freq = max(mapping_back[tok][pos], key = mapping_back[tok][pos].get)
      res_map[tmp_tok] = most_freq.encode('ascii')
      res_list.append(tmp_tok)
  return res_sen, res_list, res_map
Example no. 56
 def tokenize(self):
     terms = word_tokenize(self.text);
     self.tokens = [];
     self.lemmas = []
     stemmer = EnglishStemmer();
     lemmatizer = WordNetLemmatizer()
     for term in terms:
         try:
             self.tokens.append(stemmer.stem(term).lower())
             self.lemmas.append(lemmatizer.lemmatize(term.lower()))
         except Exception, e:
             print 'current text:', self.text;
             print 'current term:', term;
             print str(e);
             sys.exit(-1);
Example no. 57
def computeSentiment(tweet_text):
    annotated = ''
    positive = 0
    negative = 0
    st = EnglishStemmer()

    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        #print st.stem(t.lower())
        wsp = ' '
        if len(annotated) == 0 or annotated[-1] in '@#':
            wsp = ''
        if st.stem(t.lower()) in negative_terms:
            annotated += wsp+'<span class="negative">'+t+'</span>' 
            negative += 1
        elif st.stem(t.lower()) in positive_terms:
            annotated += wsp+'<span class="positive">'+t+'</span>'
            positive += 1
        else:
            if len(t) == 1 and t not in '@#':
                annotated += t
            else: annotated += wsp + t

    return annotated, positive, negative
Example no. 58
def exe_compress_word(argv):
    word_stat_path, comp_word_stat_path = argv;
    stemmer = EnglishStemmer();
    word_stat = load_word_stat(word_stat_path);
    compress_word_stat = {};
    for word, count in word_stat.items():
        if count <= 0:
            continue;
        word = stemmer.stem(word.lower().decode('utf8'));
        compress_word_stat.__setitem__(word, max(word_stat.get(word,0), count));
    words = compress_word_stat.keys();
    words.sort();
    f = open(comp_word_stat_path, 'w');
    for word in words:
        f.write('%s %d\n' % (word.encode('utf8'), compress_word_stat[word]));
    f.close();
Example no. 59
def tokenizeTweet(tweet,unique = True):
	allWords = [word.lower() for word in word_tokenize(tweet)]

    # deletes @users, RT and URLs and saves #hashtags
	nWords, i = len(allWords), 0
	hashtags = []
	while i < nWords:
		if allWords[i] == '@':      # @users
			allWords[i:i + 2] = []
			nWords -= 2
		elif allWords[i] == 'rt':   # delete RT
			allWords[i:i + 1] = []
			nWords -= 1
		elif allWords[i] == '#':    # save the hashtag
			try:
				hashtags.append(allWords[i + 1])
				allWords[i:i + 2] = []
				nWords -= 2
			except:
				allWords[i:i + 1] = []
				nWords -= 1
		elif allWords[i] == "http":     # delete url starting with http:
			allWords[i:i + 3] = []
			nWords -= 3
		elif allWords[i][0:3] == 'www':  # delete urls starting with www.
			allWords[i:i + 1] = []
			nWords -= 1
		else:
			i += 1

	possibleWords = filter(lambda x: x not in ourStopWords and x.isdigit() == False, allWords)
	stemmer = EnglishStemmer()
	tokens = []
	for word in possibleWords:
		aux = str(stemmer.stem(word))
		if unique:
			if(aux not in tokens):		# this makes each token appears only once
				tokens.append(aux)
		else:
			tokens.append(aux)			
	for tag in hashtags:		# this makes each token appears only once
		if unique:
			if '#' + tag not in tokens:
				tokens.append('#' + tag)
		else:
			tokens.append('#'+tag)
	return tokens