def remove_similar_words(word_list, master_list):
    # Could replace the double for-loops with itertools.product? (see the sketch after this function)
    new_words = []
    stemmer = LancasterStemmer()
    for i in sorted(set(word_list)):
        x = []
        ii = re.sub("[^\w\s]","",i)
        ii = re.sub("_","",ii)
        ii = re.sub("[ ]"," ",ii)
        ii = stemmer.stem(ii)
        for j in sorted(master_list):
            jj = re.sub("[^\w\s]","",j.decode('UTF-8'))
            jj = re.sub("_","",jj)
            jj = re.sub("[ ]"," ",jj)
            if i[:3]==j[:3]:
                jj = stemmer.stem(jj)
                if ii == jj:
                    sim = nltk.edit_distance(i,j)
                    if sim > 1:
                        x.append(i)
                else:
                    x.append(i)
        x = list(set(x))
        new_words+=x
    new_words = list(set(new_words))
    master_list=list(set(master_list))
    new_words = [i for i in new_words if i not in master_list]
    master_list.extend(list(set(new_words)))
    return new_words, master_list
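As hinted in the comment above, the nested loops can be written over itertools.product (combinations draws pairs from a single sequence, while word_list and master_list are two different ones). Below is a minimal, untested sketch of that rewrite under the same stemming and prefix rules; the helper name remove_similar_words_product is hypothetical.

import itertools
import re

import nltk
from nltk.stem import LancasterStemmer


def remove_similar_words_product(word_list, master_list):
    """Sketch: same filtering idea as above, expressed with itertools.product."""
    stemmer = LancasterStemmer()

    def norm(s):
        # strip punctuation and underscores, then stem (mirrors the re.sub calls above)
        return stemmer.stem(re.sub(r"[^\w\s]|_", "", s))

    new_words = set()
    for i, j in itertools.product(sorted(set(word_list)), sorted(master_list)):
        jj = j.decode("utf-8") if isinstance(j, bytes) else j
        if i[:3] != jj[:3]:
            continue
        # keep i when the stems differ, or when they match but the surface forms
        # are more than one edit apart (the same rule as the nested loops above)
        if norm(i) != norm(jj) or nltk.edit_distance(i, jj) > 1:
            new_words.add(i)

    master_list = list(set(master_list))
    new_words = [w for w in new_words if w not in master_list]
    master_list.extend(new_words)
    return new_words, master_list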
Example #2
def lemmatize(s):
    # Note: despite the name, this applies the Lancaster *stemmer*, not a lemmatizer.
    lancaster = LancasterStemmer()
    words = s.split()
    for i in range(0, len(words)):
        words[i] = lancaster.stem(words[i])
    s = ' '.join(words)
    return s
class Tokenizer():

    def __init__(self):
        self.stemmer = LancasterStemmer()

    def __call__(self, text):
        return [self.stemmer.stem(token) for token in word_tokenize(text)]
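A callable class like this Tokenizer is typically handed to a vectorizer. A minimal usage sketch, assuming scikit-learn (1.0 or newer) is installed and that LancasterStemmer and word_tokenize are imported as in the snippet:

from sklearn.feature_extraction.text import CountVectorizer

# token_pattern=None silences the warning about the unused default pattern
vectorizer = CountVectorizer(tokenizer=Tokenizer(), token_pattern=None)
X = vectorizer.fit_transform(["The cats are running", "A cat ran home"])
print(vectorizer.get_feature_names_out())  # vocabulary of Lancaster stems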
Example #4
def stem_words(words):
    stemmer = LancasterStemmer()
    stems_words = []
    for word in words:
        rstem = stemmer.stem(word)
        stems_words.append(rstem + " ")
    return stems_words
Example #5
def stem_text(text):
    from nltk.stem import LancasterStemmer
    ls = LancasterStemmer()
    tokens = tokenize_text(text)
    filtered_tokens = [ls.stem(token) for token in tokens]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
Example #6
def preprocess(raw_string):
    stop_words = set(stopwords.words('english')) 
    res = ''
    if raw_string != 'NA':
        res = str(raw_string).lower().split()
        
        # Remove stopwords
        res = [word for word in res if word not in stop_words]
        
        # Remove punctuation
        tokenizer = RegexpTokenizer(r'\w+')

        tmp_res = []
        for s in res:
            tokens = tokenizer.tokenize(s)
            tmp_res.extend(tokens)
        res = tmp_res
        
        
        # Stemming
        stemmer = LancasterStemmer()
        tmp_res = []
        for s in res:
            if s.isalpha():
                tmp_res.append(stemmer.stem(s))
            else:
                tmp_res.append(s)
        res = tmp_res
        
    else:
        res = ['NA']
        
    return res
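A quick usage sketch (it assumes the NLTK stopwords corpus has been downloaded; exact stems depend on the Lancaster rules, so none are asserted here):

print(preprocess("The Cats are running HOME!"))  # lower-cased, stopword-free, stemmed tokens
print(preprocess("NA"))                          # the sentinel value passes straight through as ['NA']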
Example #7
def stemming(tokens):
    stemmer = LancasterStemmer()
    stems = []
    for word in tokens:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
def stem_words(words):
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
def lancaster_stemming(lista_doc):
    lista = []
    stemmer = LancasterStemmer()
    for doc in lista_doc:
        for word in doc:
            lista.append(stemmer.stem(word))
    return lista
Example #10
class Tokenizer(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'some_regular_expression')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]
Example #11
def strip_sentiment_list(data):
    lanc = LancasterStemmer()
    list_of_words = list()

    for i in data:
        new = re.sub(r'http\S+', '', i)

        for char in [
                '!', '?', ',', '.', '@', '#', '&', '\n', '"', '$', '%', "'",
                '(', ')', '*', '-', '+', '/', '^', '[', ']', '”', '_', ':',
                ';', '|', '{', '}', '~', '€', '£', '“', '1', '2', '3', '4',
                '5', '6', '7', '8', '9', '0', '=', '°', 'º', 'ʖ', '…', '⤵',
                '↔️'
        ]:
            new = new.replace(char, '')

        new = strip_emoji(new)
        new = new.lower()
        new = new.split(" ")

        for j in new:
            new_word = lanc.stem(j)
            list_of_words.append(new_word)

    return strip_stopwords(list_of_words)
Example #12
class Baseline(object):
  def __init__(self):
    self.stemmer = LancasterStemmer()
    self.stopwords = set([self.stemmer.stem(word) for word in stopwords])

  def stem(self, doc):
    return [self.stemmer.stem(word) for word in doc]

  def doc_similarity(self, s1, s2, pairId=None):
    s1 = s1.lower().split()
    s2 = s2.lower().split()
    s1 = self.stem(s1)
    s2 = self.stem(s2)
    s1 = set(s1) - self.stopwords
    s2 = set(s2) - self.stopwords
    return float(len(s1.intersection(s2)))/((len(s1)+len(s2)))
Example #13
def read_files(categories):
    feats = list()
    #porter = PorterStemmer()
    lancaster = LancasterStemmer()
    print("\n##### Reading files...")
    for category in categories:
        files = get_filenames_in_folder('Volkskrant/' + category)
        num_files = 0
        for f in files:
            data = open('Volkskrant/' + category + '/' + f, 'r', encoding='UTF-8').read()
            #data = data.lower()
            #data = porter.stem(data)
            tokens = word_tokenize(data)
            lancaster_list = [lancaster.stem(token) for token in tokens]

            #bag = bag_of_words(tokens)
            #ww=high_information([(bag, category)], [category])
            #bag = bag_of_words(lancaster_list)

            #bag = bag_of_non_stopwords(tokens)
            bag = bag_of_non_stopwords(lancaster_list)
            feats.append((bag, category))
            #print len(tokens)
            num_files += 1
Example #14
def getSearchEngineResult(query_dict):
    result_dict = {}
    ix = index.open_dir("index")

    # with ix.searcher(weighting=scoring.BM25F()) as searcher:
    with ix.searcher(weighting=scoring.ScoringFunction()) as searcher:
        # TODO - Define your own query parser
        parser = QueryParser("contents",
                             schema=ix.schema,
                             group=OrGroup.factory(0))
        stemmizer = LancasterStemmer()
        stopWords = set(stopwords.words('english'))

        # print(stopWords)
        for qid, q in query_dict.items():

            table = str.maketrans('\n?.,!', '     ')
            q_nomark = q.translate(table)

            new_q = ''
            for word in q_nomark.split(' '):
                if word.lower() not in stopWords:
                    word_stem = stemmizer.stem(word.lower())
                    new_q += word_stem + ' '
            # print(new_q)
            query = parser.parse(new_q.lower())
            results = searcher.search(query, limit=None)
            # for result in results:
            #     print(result.fields()['docID'], result.score)

            result_dict[qid] = [result.fields()['docID'] for result in results]
    return result_dict
Example #15
def get_word_stems(date, num_subreddits):
    import nltk
    from nltk.stem import PorterStemmer
    from nltk.stem import LancasterStemmer
    from nltk.stem import WordNetLemmatizer

    table_id = f"top_{num_subreddits}_word_counts"

    query = f"""SELECT *
                FROM
                `{date}.{table_id}`
                """
    client = bigquery_client()

    job_config = bigquery.QueryJobConfig()
    job_config.use_legacy_sql = False

    query_job = client.query(query, job_config=job_config)

    df = query_job.to_dataframe()

    porter = PorterStemmer()
    lancaster=LancasterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()

    copy = df.copy()

    copy['porter']=copy['word'].apply(lambda x: porter.stem(x))
    copy['lancaster']=copy['word'].apply(lambda x: lancaster.stem(x))
    copy['lemmatised']=copy['word'].apply(lambda x: wordnet_lemmatizer.lemmatize(x))

    store_blob(copy, bucket_name=date, blob_name=f"{table_id}_stems.csv")
Example #16
def preproc(twts):

    stop_words = set(stopwords.words('english'))
    stab = []
    lstem = LancasterStemmer()
    REPLACE_NO_SPACE = re.compile(r"[.;:!\'?,\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
    REPLACE_REP = re.compile(r"(\w)\1{2,}")

    #abst = [i for sublist in dat for i in sublist]
    i = 0
    for tweets in twts:
        tweets = tweets.lower()
        tweets = re.sub(r'[^\w\s]', '', tweets)
        tweets = REPLACE_NO_SPACE.sub("", tweets)
        tweets = REPLACE_WITH_SPACE.sub(" ", tweets)
        tweets = REPLACE_REP.sub("", tweets)
        word_tokens = word_tokenize(tweets)
        filtered_sentence = [
            lstem.stem(w) for w in word_tokens if not w in stop_words
        ]
        stab.append(filtered_sentence)
        i += 1
        print(i / len(twts))

    stab = [remove_num_nword(i) for i in stab]
    stab = list(map(lambda x: ' '.join(x), stab))
    return stab
Example #17
def stem_words(words):
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
Example #18
    def process(self):
        """
        Apply all the specified preprocessing steps to the tokens from a segment
        :return: Tokens, with preprocessing steps applied to them
        """

        processed_tokens = self.tokens

        if self.stopword_removal:
            stop = set(stopwords.words('english'))
            processed_tokens = [self.case(token[0]) for token in processed_tokens if token[0] not in stop]

        if self.stemming:
            if self.stemmer == 'lancaster':
                ls = LancasterStemmer()
                processed_tokens = [self.case(ls.stem(token[0])) for token in processed_tokens]
            elif self.stemmer == 'snowball':
                ss = SnowballStemmer('english')
                processed_tokens = [self.case(ss.stem(token[0])) for token in processed_tokens]

            else:
                ps = PorterStemmer()
                processed_tokens = [self.case(ps.stem(token[0])) for token in processed_tokens]

        if self.lemmatization:
            lemma = nltk.wordnet.WordNetLemmatizer()
            processed_tokens = [self.case(lemma.lemmatize(token[0], token[1])) for token in processed_tokens]

        # no preprocessing method was selected, return only [Bagdad, car,...] instead of [[Bagdad, NNP], [car, NN], [..]..]
        if not self.stopword_removal and not self.stemming and not self.lemmatization:
            processed_tokens = [self.case(token[0]) for token in processed_tokens]

        return processed_tokens
Example #19
def stem_metn(request):
    soz_class = NameForm
    cumle_class = TextForm
    morf_class = SozForm
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    k = request.POST.get('metn', '')
    alqo = request.POST.get('alqo', '')
    txt = k
    if alqo == 'Bizim Alqoritm':
        txt = metn_oxu(k)
    elif alqo == 'Porter Alqoritmi':
        txt = porter.stem(txt)
    elif alqo == 'Lancaster Alqoritmi':
        txt = lancaster.stem(txt)
    elif alqo == 'WordNet Alqoritmi':
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = metn_oxu(wordnet_lemmatizer.lemmatize(k))

    return render(request, 'metn.html', {
        'form': soz_class,
        'cumle': cumle_class,
        'morf': morf_class,
        'txt': txt
    })
Example #20
def train_bayes(train_list, N_d):
    log_prior = {}
    V = set()
    big_doc = {}
    log_likelihood = {}
    alpha = 0.5
    lc = LancasterStemmer()
    tags = ['NN', 'NNS', 'NNP', 'NNPS']
    stop_words = set(stopwords.words('english'))
    for c in train_list.keys():
        # log_prior
        for file in train_list[c]:
            if c in log_prior:
                log_prior[c][0] = log_prior[c][0] + 1 / N_d
                log_prior[c][1] = math.log(log_prior[c][0])
            else:
                log_prior[c] = []
                log_prior[c].append(1 / N_d)
                log_prior[c].append(math.log(1 / N_d))

            # V & big_doc
            cur_file = open(file, 'r')
            token_file = sent_tokenize(cur_file.read())
            for sent in token_file:
                token_sent = word_tokenize(sent)
                tagged_sent = nltk.pos_tag(token_sent)
                for word in tagged_sent:
                    weight = 1
                    if word[1] in tags:
                        weight = 2
                    stem_word = lc.stem(word[0])
                    if stem_word not in stop_words:
                        if stem_word not in V:
                            V.add(stem_word)
                        if c in big_doc:
                            if stem_word in big_doc[c]:
                                big_doc[c][stem_word] += weight
                            else:
                                big_doc[c][stem_word] = weight
                        else:
                            big_doc[c] = {}
                            big_doc[c][stem_word] = weight
            cur_file.close()

    big_doc_size = {}
    for c in train_list.keys():
        big_doc_size[c] = sum(big_doc[c].values())

    # log_likelihood
    for c in train_list.keys():
        for w in V:
            if w not in big_doc[c]:
                log_likelihood[(w, c)] = math.log(alpha / (big_doc_size[c] +
                                                           (alpha * len(V))))
            else:
                log_likelihood[(w, c)] = math.log(
                    (big_doc[c][w]) / (big_doc_size[c]))

    return log_prior, log_likelihood, V
Example #21
 def stemmingLS(self):
     from nltk.stem import LancasterStemmer
     ls = LancasterStemmer()
     temp = copy.deepcopy(self.data)
     for i in range(len(temp)):
         for j in range(len(temp[i][1])):
             temp[i][1][j] = ls.stem(temp[i][1][j])
     return temp
Example #22
def lancaster_stemmer(text):
    lancaster = LancasterStemmer()
    stemmed_words = []

    for word in text.split(" "):
        stemmed_words.append(lancaster.stem(word))

    return " ".join(stemmed_words)
Example #23
def token_stemming(x):
    '''Uses the LancasterStemmer from nltk to stem tokens in the provided
    list of lists `x`'''

    stemmer = LancasterStemmer()
    x = [[stemmer.stem(w) for w in s] for s in x]
    
    return x
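Per the docstring, x is a list of token lists (for example, one list per sentence); a small illustrative call:

sentences = [["cats", "running", "happily"], ["she", "jumped"]]
print(token_stemming(sentences))  # same nested structure, each token replaced by its Lancaster stem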
def calculate_uncommon_words_percent(text_file_name):
    # read the text from a file
    text_file = open(text_file_name)
    text = text_file.read()
    text_file.close()

    # replace punctuation with empty strings
    punctuation_marks = [
        '.', ',', ':', ';', '(', ')', '!', '?', '[', ']', '$', '"', "'", '’',
        '-', '–', '●', '\t', '“', '”', '\n', '0', '1', '2', '3', '4', '5', '6',
        '7', '8', '9'
    ]
    for punctuation_mark in punctuation_marks:
        text = text.replace(punctuation_mark, ' ')

    # chop the text into words, using spaces as word dividers
    words_in_text = text.split()

    stemmer = LancasterStemmer()
    stems_in_text = []
    for word in words_in_text:
        stem = stemmer.stem(word)
        lower_case_stem = stem.lower()
        stems_in_text.append(lower_case_stem)

    # read the 1000 most common English words from a file
    common_words_file = open('most_common_words.txt')
    common_words_text = common_words_file.read()
    common_words_file.close()

    common_words = common_words_text.split()

    common_stems = []
    for word in common_words:
        stem = stemmer.stem(word)
        lower_case_stem = stem.lower()
        common_stems.append(lower_case_stem)

    uncommon_stems = []
    for stem in stems_in_text:
        if not (stem in common_stems):
            uncommon_stems.append(stem)

    # note: this is a fraction between 0 and 1, not a percentage
    pct_uncommon_stems = len(uncommon_stems) / len(stems_in_text)
    rounded_pct_uncommon_stems = round(pct_uncommon_stems, 3)
    return rounded_pct_uncommon_stems
Example #25
def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
Example #26
 def get_stems(self):
     stemmerlan = LancasterStemmer()
     stems = []
     for i in self.words:
         stems.append(stemmerlan.stem(i))
     print("\n提取词干后:")
     print(stems)
     return stems
def lancaster_stem(tokenize_list):
    l = LancasterStemmer()
    result = []

    for w in tokenize_list:
        result.append(l.stem(w))

    return result
 def stem_words(row):
     stemmed_row = ''
     lancaster = LancasterStemmer()
     for word in row:
         word = word.strip()
         stemmed_word = lancaster.stem(word)
         stemmed_row += stemmed_word + ' '
     return stemmed_row
Example #29
def stem_words(words):
    stemmer = LancasterStemmer()
    stems_words = []
    for word in words:
        stem = stemmer.stem(word)
        stems_words.append(stem)
        stems_words.append(' ')  #add space between words
    return stems_words
Example #30
def stemSentence(sentence):
     lancaster = LancasterStemmer()
     token_words = word_tokenize(sentence)
     stem_sentence = []
     for word in token_words:
         stem_sentence.append(lancaster.stem(word))
         stem_sentence.append(" ")
    
     return "".join(stem_sentence)
Example #31
def stem_words(wrd):
    stemmer = LancasterStemmer()  # Selects the stemmer from nltk
    stems = []  # List of updated words

    for word in wrd:
        stem = stemmer.stem(word)  # Stems the word
        stems.append(stem)  # and appends it to the list

    return stems
Example #32
 def words_stem(tokens):
     """
     convert all words to root of that word
     :param tokens:
     :return:
     """
     from nltk.stem import PorterStemmer, LancasterStemmer
     stemmer = LancasterStemmer()
     return [stemmer.stem(word) for word in tokens]
Example #33
def lemmatization(lista_palabras_sp_ssw_scr):
    # Despite the name, this applies Lancaster stemming and returns the set of stems.
    stemmerlan = LancasterStemmer()
    lista_palabras_sp_ssw_scr_stem = []
    for word in lista_palabras_sp_ssw_scr:
        lista_palabras_sp_ssw_scr_stem.append(stemmerlan.stem(word))
    tokens = set(lista_palabras_sp_ssw_scr_stem)

    #print(tokens)
    return tokens
Example #34
class TFIDF:

    def __init__(self):
        self.pickle_docs = "tfidf_pickle_docs"
        self.pickle_corpus = "tfidf_pickle_corpus"
        self.lan = LancasterStemmer()
        self.construct()
        #print sorted(self.words.iteritems(), key = operator.itemgetter(1), reverse=True)[:20]

    def clean(self, word):
        '''cleans a word or returns None if it should not be considered'''
        word = word.strip(string.punctuation)
        word = self.lan.stem(word)
        return word
    
    def construct(self):
        corpus = {}

        # Check to see if we should simply load a pickle
        if os.path.isfile(self.pickle_docs):
            with open(self.pickle_docs, "rb") as docs_file:  # binary mode for pickle
                current_doclist = pickle.load(docs_file)
                if os.listdir('articles/') == current_doclist:
                    # current article list is the same as pickled article list
                    # so we want to just load the stored pickled corpus data
                    with open(self.pickle_corpus, "rb") as corpus_file:
                        self.words = pickle.load(corpus_file)
                        self.n = len(current_doclist)
                        return
        
        # If we don't load a pickle, build the corpus from articles/ dir
        num_docs = 0.0
        for file_name in os.listdir('articles/'):
            num_docs += 1
            doc = {}
            with open("articles/" + file_name) as article:
                for line in article:
                    for word in tokenize(line, "word", return_spans=False):
                        word = self.clean(word)
                        doc[word] = 1
            for key in doc.keys():
                corpus[key] = corpus.get(key, 0) + 1

        self.words = corpus
        self.n = num_docs

        print "Pickling a new TFIDF corpus"
        # pickle corpus and document list
        with open(self.pickle_docs, "w") as docs_file:
            pickle.dump(os.listdir('articles/'), docs_file)
        with open(self.pickle_corpus, "w") as corpus_file:
            pickle.dump(self.words, corpus_file)

    def weight(self, word, count, debug=False):
        if debug:
            return (word, count, self.words.get(word, 1))
        return  count * math.log(self.n / self.words.get(word, 1))
Example #35
	def stemWords(self, words, stemmer="porter"):
		"""Stem words in list of tokenized words.

		'stemmer' selects the algorithm: "lancaster", "snowball", or "porter" (default).
		"""
		if stemmer == "lancaster":
			stemmer = LancasterStemmer()
		elif stemmer == "snowball":
			stemmer = SnowballStemmer("english")
		else:
			stemmer = PorterStemmer()
		stems = [stemmer.stem(word) for word in words]
		return stems
Example #36
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Portst.stem(word)
    if new == word:
        new = Landst.stem(word)
        if new == word:
            new = Regst.stem(word)
    return new
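The function only falls through to the next stemmer when the previous one leaves the word unchanged (Porter, then Lancaster, then the 'ing|ed' RegexpStemmer). A small illustrative loop, with no particular outputs asserted:

for w in ("jumping", "jumps", "lying", "strange"):
    print(w, "->", stemming(w))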
Example #37
class StemTokenizer(object):
    def __init__(self, stemmer_type='Porter'):
        self.stemmer_type = stemmer_type
        if self.stemmer_type == 'Porter':
            self.stemmer = PorterStemmer()
        elif self.stemmer_type == 'Lancaster':
            self.stemmer = LancasterStemmer()
        else:
            raise Exception('Invalid stemmer_type = {0}'.format(stemmer_type))

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]
Example #38
    def stem(self, input_text):
        tokenizer = RegexpTokenizer(r"\s+", gaps=True)
        stemmed_text = []
        lemmatizer = WordNetLemmatizer()
        stemmer = LancasterStemmer()
        text = tokenizer.tokenize(str(input_text))
        filtered_text = self.stopword(text)
        for word in filtered_text:
            if word.isalpha():
                stemmed_text.append(stemmer.stem(word).lower())

        " ".join(stemmed_text)

        return stemmed_text
Example #39
class LancasterTokenizer(object):
    def __init__(self):
        self.ls = LancasterStemmer()
        self.rx = RegexpTokenizer(r"(?u)\b\w\w+\b")

    def isNumber(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def __call__(self, doc):
        return [self.ls.stem(t) for t in self.rx.tokenize(doc) if not self.isNumber(t)]
Example #40
class Tokenizer():
    """
    Tokenizes and stems text using NLTK libraries
    """

    def __init__(self):
        """
        Constructs a tokenizer object
        """
        self.stemmer = LancasterStemmer()

    def __call__(self, text):
        """
        Tokenizes text

        :param text: the text to tokenize
        :type text: str or unicode
        :return: a list of tokens
        :rtype: list of (str or unicode)
        """
        return [self.stemmer.stem(token) for token in word_tokenize(text)]
    def word_refiner(*args):
        Portst = PorterStemmer()
        Landst = LancasterStemmer()
        Regst = RegexpStemmer('ing|ed|ly|lly')
        args = [i for i in args if isinstance(i, str)]  # 'unicode' is Python 2; plain str in Python 3

        for w in map(str, args):
            if w in dic1:
                yield w
            else:
                st1 = Portst.stem(w)
                if st1 in dic1:
                    yield st1
                else:
                    st2 = Landst.stem(w)
                    if st2 in dic1:
                        yield st2
                    else:
                        st3 = Regst.stem(w)
                        if st3 in dic1:
                            yield st3
                        else:
                            yield w
def preprocess(sentence):
    output_list = []

    #CASE FOLDING [NOT COMPLETE]
    sentence = sentence.lower()

    #DATA CLEANING
    # str.replace does not take regular expressions, so the pattern-like
    # placeholders are handled with re.sub instead (URL pattern assumed)
    sentence = re.sub(r'https?://\S+', '', sentence)   # links
    sentence = sentence.replace('@', '')               # mention marker
    sentence = sentence.replace('#', '')               # hashtag marker
    sentence = re.sub(r'\brt\b', '', sentence)         # retweet marker (text already lower-cased)
    sentence = sentence.replace(',','')
    sentence = sentence.replace('!','')
    sentence = sentence.replace('?','')
    sentence = sentence.replace('.','')
    sentence = sentence.replace('\'','')
    sentence = sentence.replace('\"','')
    sentence = sentence.replace(':','')

    #REMOVE REPEATED CHARS
    #sentence = re.sub(r'(\w)\1+', r'\1', sentence)

    #TOKENIZE
    tt = TweetTokenizer()
    temp = tt.tokenize(sentence)

    #REMOVE STOP WORDS
    stop = stopwords.words('english')

    #STEMMING
    ls = LancasterStemmer()
    newtemp = [eachword for eachword in temp if eachword not in stop]
    for eachword in newtemp:
        output_list.append(ls.stem(eachword))

    return output_list
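A usage sketch for this tweet preprocessor, assuming the NLTK stopwords corpus is available; the example tweet text is made up:

print(preprocess("RT @user: Loving the new update! https://t.co/abc123 #nltk"))
# -> a list of lower-cased, stopword-free Lancaster stems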
class LancasterTokenizer(object):
        def __init__(self):
            self.wnl = LancasterStemmer()
        def __call__(self, doc):
            return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]
import nltk
from nltk.stem import LancasterStemmer
stemmerlan=LancasterStemmer()
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))
Example #45
            Tokens2.append(w)
    #fix ascii again, don't know what's happening here!
    # assumed intent of the truncated loop: drop characters with ord > 128
    Tokens2 = ["".join(char for char in w if ord(char) <= 128) for w in Tokens2]
    #stemming
    Tokens3 = []
    for w in Tokens2:
        Tokens3.append(lanStem.stem(w))
    return Tokens3
    

test = clean(tesSum1)

#read files
tesSum1 = teslaSummary1.read()
tesSum5 = teslaSummary5.read()

#stpWrds = set(stopwords.words("english"))
#print stpWrds

sum1SentTok = sentTok(tesSum1)
sum2SentTok = sentTok(tesSum2)

sum1WordTok = wordTok(tesSum1)
sum2WordTok = wordTok(tesSum2)

for w in sum1WordTok:
    print(LancasterStemmer().stem(w))
Example #46
       else:
           urlUsed.add(url)
           the_page = Response.read()
           #url parser
           soup = BeautifulSoup(the_page,'html.parser')
           p=soup.findAll('p')
           for ps in p:
               psStr = str(ps.get_text().encode('utf-8'))
               psStr = re.sub(r'[^a-zA-Z\s\n]',' ',psStr)
               if len(psStr)>2:
                  psStr = nltk.word_tokenize(psStr)
                  for ss in psStr:
                      
                      wordLemmatized = wnl.lemmatize(ss)
                      wordStemed = stem.stem(wordLemmatized)
                      tempArticle += str(wordStemed).lower()+' '
       #        print(ps.get_text())           

           if len(tempArticle) > 800:
              r += 1
              writer = open('texts/'+str(r),'w')
              writer.write(tempArticle)
              writer.close()

              writer = open('labellist3','a')
              writer.write(url + '\t' + str(r) + os.linesep)
              writer.close()
     #   if hasChinese(ps.get_text()):
       
     #   else:
# porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

print(ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped'))

print(ps.stem('lying'))

print(ps.stem('strange'))

# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

print(ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped'))

print(ls.stem('lying'))

print(ls.stem('strange'))


# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)

print(rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped'))

print(rs.stem('lying'))

print(rs.stem('strange'))
Example #48
	def stemmed(self,word):
		stemmer = LancasterStemmer()
		return stemmer.stem(word)
Example #49
def stem_document(document):
	from nltk.stem import LancasterStemmer
	stemmer = LancasterStemmer()
	return stemmer.stem(document)
# <h2>Stemming Words</h2>
# <p>Stemming is the process of removing <em>affixes</em> from a word to obtain its root, or <em>stem</em>. For example, the stem of <strong>
#     growing</strong> is <strong>grow</strong>. </p>
# <p>NLTK includes four stemming algorithms, three of which are demonstrated below. The fourth, <em>Snowball</em>, also handles non-English languages
#     and is not covered here but is in the text. </p>

# <codecell>

from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer
porter = PorterStemmer()
lancaster = LancasterStemmer()
reg = RegexpStemmer('ing')
g = 'growing'
print('Porter yields: ', porter.stem(g))
print('lancaster yields: ', lancaster.stem(g))
print('Regexp yields: ', reg.stem(g))

# <markdowncell>

# <p>The output of various words can be different between stemmers:</p>

# <codecell>

g = 'cookery'
print('Porter yields: ', porter.stem(g))
print('lancaster yields: ', lancaster.stem(g))
print('Regexp yields: ', reg.stem(g))

# <markdowncell>
__author__ = "pratap"

# Porter Stemmer
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem("cooking"))
print(stemmer.stem("cookery"))

# Lancaster Stemmer
from nltk.stem import LancasterStemmer

lanc_stemmer = LancasterStemmer()
print(lanc_stemmer.stem("cooking"))
print(lanc_stemmer.stem("cookery"))
Example #52
class TFIDF(object):
  
  def __init__(self, tfidf_file, id2wordFile=None):
    self.model = models.TfidfModel.load(tfidf_file)
    self.stemmer = LancasterStemmer()
    self.stopwords = set([self._preprocess_word(word) for word in stopwords])
    #self.stem_model()
    print "done"    

  def _preprocess_word(self, word):
    return self.stemmer.stem(word.lower())
    #return word.lower()

  def stem(self, doc):
    return [self.stemmer.stem(word) for word in doc]

  def stem_model(self):
    print("stemming")
    new_id2word = corpora.Dictionary()
    # Create a new dictionary with the stemmed terms and summed document frequencies
    for termid, freq in self.model.dfs.items():
      stemmed_word = self.stemmer.stem(self.model.id2word[termid])
      stemmed_id = None
      if stemmed_word in new_id2word.token2id:
        stemmed_id = new_id2word.token2id[stemmed_word]
      else:
        stemmed_id = len(new_id2word.token2id)
        new_id2word.token2id[stemmed_word] = stemmed_id
        new_id2word.dfs[stemmed_id] = 0
      new_id2word.dfs[stemmed_id] += freq # add df from old dicionary
    new_id2word.num_docs = self.model.id2word.num_docs
    new_id2word.num_nnz = self.model.id2word.num_nnz
    new_id2word.num_pos = self.model.id2word.num_pos
    self.model.id2word = new_id2word
    self.model.dfs = self.model.id2word.dfs
    self.model.idfs = precompute_idfs(self.model.wglobal, self.model.dfs, self.model.num_docs)
    self.model.save('models/all_lancaster.tfidfmodel')
    print(len(new_id2word))
    print("done stemming")

  def restrict_vocab(self, corpus):
    vocab = set()
    for doc in corpus:
      for idx, freq in doc:
        vocab.add(idx)
    dfs, idfs = {}, {}  # restricted document-frequency / idf tables
    for idx in vocab:
      dfs[idx] = self.model.dfs[idx]
      idfs[idx] = self.model.idfs[idx]
    self.model.dfs = dfs
    self.model.idfs = idfs

  def to_bow(self, doc):
    doc = [self._preprocess_word(word) for word in doc.lower().split() if word not in self.stopwords]
    return self.model.id2word.doc2bow(doc)

  def doc_similarity(self, s1, s2, pairId=None):
    # tfidf1 = self.model[self.to_bow(s1)]
    # tfidf2 = self.model[self.to_bow(s2)]
    # index = similarities.MatrixSimilarity([tfidf1],num_features=len(self.model.id2word))
    # return math.sqrt(index[tfidf2][0])*4. + 1

    tfidf1 = self.model[self.to_bow(s1)]
    tfidf2 = self.model[self.to_bow(s2)]
    common_terms = {term for term, _ in tfidf1} & {term for term, _ in tfidf2}
    similarity = 0.
    tfidf_total = 0.
    for term, freq in tfidf1 + tfidf2:
      if term in common_terms:
        similarity += freq
      tfidf_total += freq
    val = math.sqrt(similarity/tfidf_total)*5.
    if val < 1.: val +=1.
    return val
def LancasterTokenizer(s):
	from nltk import word_tokenize          
	from nltk.stem import LancasterStemmer
	stemmer = LancasterStemmer()
	return [stemmer.stem(t) for t in word_tokenize(s)]