def stem_words(words):
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
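The snippets on this page are shown without their imports; below is a minimal, self-contained sketch of how a helper like the one above is typically called. The sample sentence and the word_tokenize call are illustrative, and word_tokenize needs the NLTK 'punkt' models downloaded.

# assumed imports, omitted by the listing above
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize   # requires: nltk.download('punkt')

def stem_words(words):
    stemmer = LancasterStemmer()
    return [stemmer.stem(word) for word in words]

tokens = word_tokenize("Cooking and cookery are different words")
print(stem_words(tokens))
# Lancaster is an aggressive stemmer, so expect short stems such as 'cook'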
Example No. 2
 def __init__(self, stemmer_type='Porter'):
     self.stemmer_type = stemmer_type
     if self.stemmer_type == 'Porter':
         self.stemmer = PorterStemmer()
     elif self.stemmer_type == 'Lancaster':
         self.stemmer = LancasterStemmer()
     else:
         raise Exception('Invalid stemmer_type = {0}'.format(stemmer_type))
Example No. 3
def porter_stemer(texto):
    text = []
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    for words in texto:
        word_porter = porter.stem(words)
        text.append(word_porter)
        word_lancaster = lancaster.stem(words)
        text.append(word_lancaster)
    return text
Example No. 4
def stemming(df):
    new_df = []
    lancaster = LancasterStemmer()
    for i in range(0, len(df)):
        tokens = word_tokenize(df['Text'][i])
        s = [lancaster.stem(word) for word in tokens]
        new_df.append([' '.join(s), df['Label'][i]])
    new_df = pd.DataFrame(new_df)
    new_df.columns = ['Text', 'Label']
    return new_df
Example No. 5
def stem_words(words):
    """Stem words in list of tokenized words"""
    # note: the Snowball assignment below replaces the Lancaster instance,
    # so only the Snowball stemmer is actually used in this example
    stemmer = LancasterStemmer()
    stemmer = SnowballStemmer('english')
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)

    return stems
Example No. 6
 def stemWords(self, words, stemmer="lancaster"):
     """Stem words in a list of tokenized words.

     The ``stemmer`` selector is added here as a keyword argument; the
     original snippet referenced an undefined name.
     """
     if stemmer == "lancaster":
         stemmer = LancasterStemmer()
     elif stemmer == "snowball":
         stemmer = SnowballStemmer("english")
     elif stemmer == "porter":
         stemmer = PorterStemmer()
     stems = [stemmer.stem(word) for word in words]
     return stems
Example No. 8
def WordStemming(sample):
    stem1 = LancasterStemmer()
    stem2 = PorterStemmer()
    tokenWords = word_tokenize(sample)
    #tokenWords
    stem_sentence = []
    for word in tokenWords:
        stem_sentence.append(stem1.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence), tokenWords
Example No. 9
def stem_word_list(word_list):
    """Stem word_list in list of tokenized word_list
        Keyword arguments:
            word_list: list of words
    """
    stemmer = LancasterStemmer()
    stems = []
    for word in word_list:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
Example No. 10
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Portst.stem(word)
    if new == word:
        new = Landst.stem(word)
        if new == word:
            new = Regst.stem(word)
    return new
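A short usage sketch for the cascade above, assuming the PorterStemmer, LancasterStemmer and RegexpStemmer imports from nltk.stem; the printed stems depend on NLTK's rule sets, so the comment is indicative only.

# from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer
for w in ['running', 'maximum', 'cookery']:
    print(w, '->', stemming(w))  # Porter is tried first; the others are fallbacks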
Example No. 11
def word_frequence(al, rank):
    lst = LancasterStemmer()
    left = [
        lst.stem(word.lower()) for word in word_tokenize(al)
        if word.lower() not in stopwords.words('english') and len(word) > 2
    ]
    final = FreqDist(left)
    sort = sorted(list(set(final.values())))
    sort = [i for i in sort[::-1]]
    for i in sort[:rank]:  # print the words at the top `rank` frequency values
        print([v for v, k in final.items() if k == i], i)
Example No. 12
    def __stem_words(self, words):
        """Stem words in list of tokenized words"""
        if isinstance(words, str):
            words = words.split(' ')

        stemmer = LancasterStemmer()
        stems = []
        for word in words:
            stem = stemmer.stem(word)
            stems.append(stem)
        return stems
Example No. 13
def stemming(text):
    """does stemming"""
    words_list = nltk.word_tokenize(text)
    words_set = set()

    ps = LancasterStemmer()

    for word in words_list:
        words_set.add(ps.stem(word))

    return words_set
Example No. 14
 def process_lancaster_stop(text):
 
     text = re.sub('[^A-Za-z0-9]+', ' ',  text)
     tokens = word_tokenize(text)
     tokens = [w.lower() for w in tokens]
     stemmer = LancasterStemmer()
     tokens = [stemmer.stem(word) for word in tokens]
     table = str.maketrans('', '', string.punctuation)
     stripped = [w.translate(table) for w in tokens]
     words = [word for word in stripped if word.isalpha()]
     return words
Example No. 15
def stem(word_list):
    '''
    Returns the stems for a list of words
    :param word_list: list of words
    :return: list of stemmed words
    '''
    result = []
    lstemmer = LancasterStemmer()
    for word in word_list:
        w = lstemmer.stem(word)
        result.append(w)
    return result
Example No. 16
def string_stemmer(text):
    """
    :param text: words from email
    :return: string with stemmed words
    """
    stemmer = LancasterStemmer()
    stemmed_list = []

    for word in text.split(' '):
        stemmed_list.append(stemmer.stem(word))

    return ' '.join(word for word in stemmed_list)
 def stem_words(words):
     """
     Stem words in list of tokenized words
     :param words:
     :return:
     """
     stemmer = LancasterStemmer()
     stems = []
     for word in words:
         stem = stemmer.stem(word)
         stems.append(stem)
     return stems
Example No. 18
    def __init__(self, algorithm="TFIDF"):

        self.algorithm = algorithm
        self.stopWords = stopwords.words('english')

        self.wsTok = WhitespaceTokenizer()
        self.stemmer = LancasterStemmer()
        self.countVect = CountVectorizer()
        self.tfidfVect = TfidfVectorizer()

        self.queryData = []
        self.srcData = []
Example No. 19
def list_stemmer(word_list):
    """
    :param word_list: list of strings
    :return: list with stemmed words
    """
    stemmer = LancasterStemmer()
    ret_list = []

    for word in word_list:
        ret_list.append(stemmer.stem(word))

    return ret_list
def create_stemming(wrd_tokens):
    print("\n==================== Stemming ====================")

    p_stemmer = PorterStemmer()
    l_stemmer = LancasterStemmer()
    s_stemmer = SnowballStemmer('english')

    wrd_token_count = 0
    for wrd_token in wrd_tokens:
        wrd_token_count += 1
        if wrd_token_count < 7:
            print(p_stemmer.stem(wrd_token), l_stemmer.stem(wrd_token),
                  s_stemmer.stem(wrd_token))
def stem_words(text):
    """ combines the different forms of the verbs/adverbs/adjectives"""
    text = text.split()
    try:
        stemmer = LancasterStemmer()
    except LookupError:
        nltk.download('wordnet')
        stemmer = LancasterStemmer()  # ensure the name is defined after the download

    stems = list()
    for word in text:
        stem = stemmer.stem(word)
        stems.append(stem)
    return ' '.join(stems)
def prepare_embedding_matrix(max_words, embedding_dim, word_index, embeddings_index, hparams, lower_only=False):
    print('Preparing embedding matrix...')
    np.random.seed(hparams['random_state'])
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    snowball = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    count = 0
    embedding_matrix = np.zeros((max_words, embedding_dim))
    random_vector = np.random.random(embedding_dim)
    for word, i in word_index.items():
        if i >= max_words:
            continue
        
        if word in embeddings_index and word.lower() not in embeddings_index:
            embeddings_index[word.lower()] = embeddings_index[word]

        embedding_vector = embeddings_index.get(word.lower()) if lower_only else embeddings_index.get(word)

        # https://www.kaggle.com/wowfattie/3rd-place
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.lower())

        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.upper())

        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.capitalize())

        if embedding_vector is None:
            embedding_vector = embeddings_index.get(porter.stem(word)) 

        if embedding_vector is None:
            embedding_vector = embeddings_index.get(lancaster.stem(word)) 

        if embedding_vector is None:
            embedding_vector = embeddings_index.get(snowball.stem(word)) 

        if embedding_vector is None:
            embedding_vector = embeddings_index.get(lemmatizer.lemmatize(word))
         

        if word == hparams['tokenizer_oov_token'] or embedding_vector is None:
            embedding_matrix[i] = random_vector
        else:    
            embedding_matrix[i] = embedding_vector
            count += 1
        
    print('Word vectors coverage:', count / max_words)
    print('Embedding matrix shape:', embedding_matrix.shape)
    return embedding_matrix
Example No. 23
    def preprocess_text(df):

        stemmer = LancasterStemmer()
        lemmatizer = WordNetLemmatizer()
        p = inflect.engine()

        # remove special characters
        df['text'] = df['text'].apply(lambda x: re.sub("(\\W)+", " ", x))

        # remove punctuation
        df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

        # tokenize
        df['text'] = df['text'].apply(lambda x: nltk.word_tokenize(x))

        # to lower case
        df['text'] = df['text'].apply(lambda x: [word.lower() for word in x])

        # filter stopwords
        df['text'] = df['text'].apply(
            lambda x:
            [item for item in x if item not in stopwords.words('german')])

        # remove punctuation left inside tokens
        df['text'] = df['text'].apply(
            lambda x:
            [re.sub(r'[^\w\s]', '', word) for word in x if word != ''])

        # remove non-ASCII characters from the list of tokenized words
        df['text'] = df['text'].apply(lambda x: [
            unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').
            decode('utf-8', 'ignore') for word in x
        ])

        # replace integer occurrences in the list of tokenized words
        # with their textual representation
        df['text'] = df['text'].apply(
            lambda x:
            [p.number_to_words(word) if word.isdigit() else word for word in x])

        # stemming
        df['text'] = df['text'].apply(lambda x: [stemmer.stem(word) for word in x])

        # lemmatizing
        df['text'] = df['text'].apply(
            lambda x: [lemmatizer.lemmatize(word, pos='v') for word in x])

        return df
Example No. 24
def stemming(text, method='lancaster'):

    result = []

    if method == 'lancaster':
        stemmer = LancasterStemmer()
    elif method == 'porter':
        stemmer = PorterStemmer()
    else:
        raise ValueError('Unknown stemming method: {0}'.format(method))
    word_pattern = re.compile("(?:[a-zA-Z]+[-–’'`ʼ]?)*[a-zA-Z]+[’'`ʼ]?")
    words = word_pattern.findall(text)
    for word in words:
        word_stemmed = stemmer.stem(word)
        result.append(word_stemmed)
    return result
Example No. 25
    def set_content_based_on_intents(self) -> (list(), list()):
        all_words = []
        self.steammer = LancasterStemmer()
        for intent in self.intents:
            for pattern in intent["patterns"]:
                words = nltk.word_tokenize(pattern)
                words = [self.steammer.stem(word.lower()) for word in words]
                all_words.extend(words)
                self.sentences.append(words)
                self.tags.append(intent["intent"])
                words = []

        self.words_bag = sorted(list(set(all_words)))
        self.intents_bag = sorted(list(set(self.tags)))
def __getStems(words):
    #########################################################################################
    # This method returns stemmed words by applying nltk.LancasterStemmer.
    #########################################################################################
    try:
        if words:
            stemmer = LancasterStemmer()
            return [stemmer.stem(word) for word in words if word != ""]
        return words  # return words as is without any changes
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        err = "Error occurred while getting stems of the words '{0}'. Error is: {1}; {2}".format(
            " ".join(words), str(exc_type), str(exc_value))
        raise Exception(err)
Example No. 27
    def stem(self, input_text):
        tokenizer = RegexpTokenizer("\s+", gaps=True)
        stemmed_text = []
        lemmatizer = WordNetLemmatizer()
        stemmer = LancasterStemmer()
        text = tokenizer.tokenize(str(input_text))
        filtered_text = self.stopword(text)
        for word in filtered_text:
            if word.isalpha():
                stemmed_text.append(stemmer.stem(word).lower())

        " ".join(stemmed_text)

        return stemmed_text
Example No. 28
def main():
    lancaster = LancasterStemmer()
    porter = PorterStemmer()
    snowball = SnowballStemmer("english")

    word_list = ["friend", "friendship", "friends",
                 "friendships", "stabil", "destabilize",
                 "misunderstanding", "universe", "universal",
                 "university", "union",
                 "railroad", "moonlight", "football", "going",
                 "education"]
    print("{0:20}{1:20}{2:20}{3:20}".format("Word", "Porter Stemmer", "lancaster Stemmer", "Snowball Stemmer"))
    for word in word_list:
        print("{0:20}{1:20}{2:20}{3:20}".format(word, porter.stem(word), lancaster.stem(word), snowball.stem(word)))
async def process_text_nltk_handle(request):
    logger = logging.getLogger("nltk-request")
    logger.info("Process text [NLTK] request")

    request_text = await request.read()
    raw_data = request_text.decode('utf8')

    raw_data_modified = ''

    for raw_string in raw_data.split('.'):
        buff = raw_string.strip().replace(r"\n", "")
        if len(buff) > 1:
            raw_data_modified += buff + '.'

    a = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(raw_data_modified.lower())
    tokens = [x for x in tokens if x not in a]
    result = {"tokens": tokens}

    fdist = FreqDist(tokens)
    result["fdist"] = fdist

    fdist10 = fdist.most_common(10)
    result["fdist10"] = fdist10

    porter_stemmer_result = []
    lancaster_stemmer_result = []
    pst = PorterStemmer()
    lst = LancasterStemmer()
    for token in tokens:
        porter_stemmer_result.append(pst.stem(token))
        lancaster_stemmer_result.append(lst.stem(token))
    result["porter_stemmer_result"] = porter_stemmer_result
    result["lancaster_stemmer_result"] = lancaster_stemmer_result

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    result["lemmas"] = lemmas

    result["pos_tags"] = nltk.pos_tag(tokens)

    result["ner"] = ne_chunk(result["pos_tags"])

    return web.Response(text=json.dumps(result),
                        headers={"Access-Control-Allow-Origin": "*"},
                        content_type="application/json")
Example No. 30
	def obtain_the_tags_frequency(self):

		b = TextBlob(self.objectiveoutstring)

		# b, pass frequecy, pass Lancaster stemming, pass 850, pass wordnet, pass nn/vb
		# save dictionary

		wordtags = b.correct().tags

		
		# The first frequency saving
		stemmer = LancasterStemmer()
		first_frequecy = {}
		for item in wordtags:
			temp = stemmer.stem(item[0])
			if Word(temp).synsets and temp not in self.basicwords:
				if temp not in first_frequecy:
					first_frequecy[temp] = []
					first_frequecy[temp].append(item[1])
				else:
					try:
						first_frequecy[temp].append(item[1])
					except:
						print(temp)
						print(item)
			elif temp in self.basicwords:
				self.basicoverlap += 1
			else:
				self.weirdwords.append(item[0])




		
		for key in first_frequecy.keys():
			length = len(first_frequecy[key])
			tags_length = len(set(first_frequecy[key]))
			t1 = 0
			t2 = 0
			t3 = 1
			for it in set(first_frequecy[key]):
				if 'NN' in it:
					t1 = 1
					t3 = 0
				else:
					if 'VB' in it:
						t2 = 1
						t3 = 0
			self.second_frequency.append((key, length, tags_length, t1, t2, t3))
Example No. 31
def stem_lem_words(word_list, engine="word_net"):
    """
    :param word_list: a list of words
    :param engine: variable to choose between available stemmer
    :return: stemmed/lemmatized list of words
    """
    if engine == "porter":
        porter = PorterStemmer()
        return [porter.stem(word) for word in word_list]
    elif engine == "lancaster":
        lancaster = LancasterStemmer()
        return [lancaster.stem(word) for word in word_list]
    elif engine == "word_net":
        lem = WordNetLemmatizer()
        return [lem.lemmatize(word) for word in word_list]
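A brief usage sketch comparing the three engines accepted by the function above (assuming PorterStemmer, LancasterStemmer and WordNetLemmatizer are imported from nltk, and that the WordNet corpus is available for the 'word_net' path):

words = ['studies', 'studying', 'cries']
for engine in ('porter', 'lancaster', 'word_net'):
    print(engine, stem_lem_words(words, engine=engine))
# the stemmers may return non-words, while lemmatization keeps dictionary forms;
# exact output depends on the NLTK rule sets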
Example No. 32
    def run(self):
        super().run()

        # stem words in input file
        stemmer = LancasterStemmer()

        output = open(self.output, 'w+')
        with open(self.input, mode='r') as input:
            for line in input:
                for word in line.split():
                    output.write(stemmer.stem(word) + ' ')

                output.write('\n')

        output.close()
Example No. 33
def stem_and_rem_stopwords(documents:list, additional_stopwords: list = []):
    """Returns a list of documents that have been stemmed and
    had stopwords removed.
    """
    s_words = set(stopwords.words('english') + additional_stopwords)
    stemmer = LancasterStemmer()

    processed_documents =[]
    for document in documents:
        tokens = document.split()
        processed_documents.append(
            ' '.join([stemmer.stem(token) for token in tokens if token not in s_words])
            )

    return processed_documents
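A small usage sketch for the helper above, assuming nltk.corpus.stopwords is imported and its English stopword list has been downloaded; the sample documents are illustrative.

docs = ["the cats are running in the garden",
        "a runner ran past the running dogs"]
print(stem_and_rem_stopwords(docs, additional_stopwords=['past']))
# each document comes back as a single space-joined string of Lancaster stems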
Example No. 34
class Baseline(object):
  def __init__(self):
    self.stemmer = LancasterStemmer()
    self.stopwords = set([self.stemmer.stem(word) for word in stopwords])

  def stem(self, doc):
    return [self.stemmer.stem(word) for word in doc]

  def doc_similarity(self, s1, s2, pairId=None):
    s1 = s1.lower().split()
    s2 = s2.lower().split()
    s1 = self.stem(s1)
    s2 = self.stem(s2)
    s1 = set(s1) - self.stopwords
    s2 = set(s2) - self.stopwords
    return float(len(s1.intersection(s2)))/((len(s1)+len(s2)))
Example No. 35
class Tokenizer():

    def __init__(self):
        self.stemmer = LancasterStemmer()

    def __call__(self, text):
        return [self.stemmer.stem(token) for token in word_tokenize(text)]
Example No. 36
class Tokenizer(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'some_regular_expression')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]
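Callable classes like the one above are usually handed to a scikit-learn vectorizer as its tokenizer; below is a minimal sketch under that assumption (the class name and word pattern are illustrative, since the original regex is a placeholder).

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

class StemmingTokenizer(object):   # hypothetical name for this sketch
    def __init__(self):
        self.tok = RegexpTokenizer(r"(?u)\b\w\w+\b")   # illustrative word pattern
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]

vectorizer = CountVectorizer(tokenizer=StemmingTokenizer())
X = vectorizer.fit_transform(["friends were running", "a friendly runner runs"])
print(X.shape)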
    def __init__(self, analyze_attributes_from_schema,
                 does_process_similar_words, precision):
        # Load medium or big corpus of english words.
        self.nlp_corpus = spacy.load("en_core_web_md")
        self.analyze_attributes_from_schema = analyze_attributes_from_schema
        self.precision = precision
        self.does_process_similar_words = does_process_similar_words

        # Aggresive stemming preferred.
        self.lancester = LancasterStemmer()

        # Use wordnet for lemmas.
        self.wordnet_lemmatizer = WordNetLemmatizer()

        # Managers used throughout the program.
        self.noise_manager = NoiseManager()
        self.trait_extractor = TraitExtractor()
        self.trait_analyzer = TraitAnalyzer()

        # Extract traits from CDM Schema documents folder.
        self.trait_list = self.trait_extractor.extract_traits(
            'CDM.SchemaDocuments/', self.trait_files)

        # The list of stemmed trait features.
        self.stem_traits = self.trait_analyzer.stem_traits(
            self.trait_list, self.lancester, self.wordnet_lemmatizer,
            self.noise_manager)
Example No. 38
class TFIDF:

    def __init__(self):
        self.pickle_docs = "tfidf_pickle_docs"
        self.pickle_corpus = "tfidf_pickle_corpus"
        self.lan = LancasterStemmer()
        self.construct()
        #print sorted(self.words.iteritems(), key = operator.itemgetter(1), reverse=True)[:20]

    def clean(self, word):
        '''cleans a word or returns None if it should not be considered'''
        word = word.strip(string.punctuation)
        word = self.lan.stem(word)
        return word
    
    def construct(self):
        corpus = {}

        # Check to see if we should simply load a pickle
        if os.path.isfile(self.pickle_docs):
            with open(self.pickle_docs) as docs_file:
                current_doclist = pickle.load(docs_file)
                if os.listdir('articles/') == current_doclist:
                    # current article list is the same as pickled article list
                    # so we want to just load the stored pickled corpus data
                    with open(self.pickle_corpus) as corpus_file:
                        self.words = pickle.load(corpus_file)
                        self.n = len(current_doclist)
                        return
        
        # If we don't load a pickle, build the corpus from articles/ dir
        num_docs = 0.0
        for file_name in os.listdir('articles/'):
            num_docs += 1
            doc = {}
            with open("articles/" + file_name) as article:
                for line in article:
                    for word in tokenize(line, "word", return_spans=False):
                        word = self.clean(word)
                        doc[word] = 1
            for key in doc.keys():
                corpus[key] = corpus.get(key, 0) + 1

        self.words = corpus
        self.n = num_docs

        print "Pickling a new TFIDF corpus"
        # pickle corpus and document list
        with open(self.pickle_docs, "w") as docs_file:
            pickle.dump(os.listdir('articles/'), docs_file)
        with open(self.pickle_corpus, "w") as corpus_file:
            pickle.dump(self.words, corpus_file)

    def weight(self, word, count, debug=False):
        if debug:
            return (word, count, self.words.get(word, 1))
        return  count * math.log(self.n / self.words.get(word, 1))
Example No. 39
class StemTokenizer(object):
    def __init__(self, stemmer_type='Porter'):
        self.stemmer_type = stemmer_type
        if self.stemmer_type == 'Porter':
            self.stemmer = PorterStemmer()
        elif self.stemmer_type == 'Lancaster':
            self.stemmer = LancasterStemmer()
        else:
            raise Exception('Invalid stemmer_type = {0}'.format(stemmer_type))

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]
    def word_refiner(*args):
        Portst = PorterStemmer()
        Landst = LancasterStemmer()
        Regst = RegexpStemmer('ing|ed|ly|lly')
        args = [i for i in args if isinstance(i, unicode)]

        for w in map(str, args):
            if w in dic1:
                yield w
            else:
                st1 = Portst.stem(w)
                if st1 in dic1:
                    yield st1
                else:
                    st2 = Landst.stem(w)
                    if st2 in dic1:
                        yield st2
                    else:
                        st3 = Regst.stem(w)
                        if st3 in dic1:
                            yield st3
                        else:
                            yield w
Example No. 41
class LancasterTokenizer(object):
    def __init__(self):
        self.ls = LancasterStemmer()
        self.rx = RegexpTokenizer(r"(?u)\b\w\w+\b")

    def isNumber(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def __call__(self, doc):
        return [self.ls.stem(t) for t in self.rx.tokenize(doc) if not self.isNumber(t)]
Example No. 42
def preprocess(sentence):
    output_list = []

    #CASE FOLDING [NOT COMPLETE]
    sentence = sentence.lower()

    #DATA CLEANING
    #str.replace() is literal, so the regex-style patterns are handled with re.sub
    sentence = re.sub(r'https?://t\.co/\S*', '', sentence)
    sentence = sentence.replace('@', '')
    sentence = sentence.replace('#', '')
    sentence = re.sub(r'\bRT\b', '', sentence)
    sentence = sentence.replace(',', '')
    sentence = sentence.replace('!', '')
    sentence = sentence.replace('?', '')
    sentence = sentence.replace('.', '')
    sentence = sentence.replace('\'', '')
    sentence = sentence.replace('\"', '')
    sentence = sentence.replace(':', '')

    #REMOVE REPEATED CHARS
    #sentence = re.sub(r'(\w)\1+', r'\1', sentence)

    #TOKENIZE
    tt = TweetTokenizer()
    temp = tt.tokenize(sentence)

    #REMOVE STOP WORDS
    stop = stopwords.words('english')

    #STEMMING
    ls = LancasterStemmer()
    newtemp = [eachword for eachword in temp if eachword not in stop]
    for eachword in newtemp:
        output_list.append(ls.stem(eachword))

    return output_list
Example No. 43
class Tokenizer():
    """
    Tokenizes and stems text using NLTK libraries
    """

    def __init__(self):
        """
        Constructs a tokenizer object
        """
        self.stemmer = LancasterStemmer()

    def __call__(self, text):
        """
        Tokenizes text

        :param text: the text to tokenize
        :type text: str or unicode
        :return: a list of tokens
        :rtype: list of (str or unicode)
        """
        return [self.stemmer.stem(token) for token in word_tokenize(text)]
Example No. 44
 def __init__(self):
     self.pickle_docs = "tfidf_pickle_docs"
     self.pickle_corpus = "tfidf_pickle_corpus"
     self.lan = LancasterStemmer()
     self.construct()
Example No. 45
def stem_document(document):
	from nltk.stem import LancasterStemmer
	stemmer = LancasterStemmer()
	return stemmer.stem(document)
Example No. 46
class TFIDF(object):
  
  def __init__(self, tfidf_file, id2wordFile=None):
    self.model = models.TfidfModel.load(tfidf_file)
    self.stemmer = LancasterStemmer()
    self.stopwords = set([self._preprocess_word(word) for word in stopwords])
    #self.stem_model()
    print "done"    

  def _preprocess_word(self, word):
    return self.stemmer.stem(word.lower())
    #return word.lower()

  def stem(self, doc):
    return [self.stemmer.stem(word) for word in doc]

  def stem_model(self):
    print "stemming"
    new_id2word = corpora.Dictionary()
    # Create a new dicitonary with the stemmed terms and summed document frequencies
    for termid, freq in self.model.dfs.iteritems():
      stemmed_word = self.stemmer.stem(self.model.id2word[termid])
      stemmed_id = None
      if stemmed_word in new_id2word.token2id:
        stemmed_id = new_id2word.token2id[stemmed_word]
      else:
        stemmed_id = len(new_id2word.token2id)
        new_id2word.token2id[stemmed_word] = stemmed_id
        new_id2word.dfs[stemmed_id] = 0
      new_id2word.dfs[stemmed_id] += freq # add df from old dicionary
    new_id2word.num_docs = self.model.id2word.num_docs
    new_id2word.num_nnz = self.model.id2word.num_nnz
    new_id2word.num_pos = self.model.id2word.num_pos
    self.model.id2word = new_id2word
    self.model.dfs = self.model.id2word.dfs
    self.model.idfs = precompute_idfs(self.model.wglobal, self.model.dfs, self.model.num_docs)
    self.model.save('models/all_lancaster.tfidfmodel')
    print len(new_id2word)
    print "done stemming"

  def restrict_vocab(self, corpus):
    vocab = set()
    for doc in corpus:
      for idx, freq in doc:
        vocab.add(idx)
    # build fresh df/idf maps restricted to the corpus vocabulary
    dfs = {}
    idfs = {}
    for idx in vocab:
      dfs[idx] = self.model.dfs[idx]
      idfs[idx] = self.model.idfs[idx]
    self.model.dfs = dfs
    self.model.idfs = idfs

  def to_bow(self, doc):
    doc = [self._preprocess_word(word) for word in doc.lower().split() if word not in self.stopwords]
    return self.model.id2word.doc2bow(doc)

  def doc_similarity(self, s1, s2, pairId=None):
    # tfidf1 = self.model[self.to_bow(s1)]
    # tfidf2 = self.model[self.to_bow(s2)]
    # index = similarities.MatrixSimilarity([tfidf1],num_features=len(self.model.id2word))
    # return math.sqrt(index[tfidf2][0])*4. + 1

    tfidf1 = self.model[self.to_bow(s1)]
    tfidf2 = self.model[self.to_bow(s2)]
    common_terms = set(zip(*tfidf1)[0])&set(zip(*tfidf2)[0])
    similarity = 0.
    tfidf_total = 0.
    for term, freq in tfidf1 + tfidf2:
      if term in common_terms:
        similarity += freq
      tfidf_total += freq
    val = math.sqrt(similarity/tfidf_total)*5.
    if val < 1.: val +=1.
    return val
Example No. 47
 def __init__(self):
     self.wnl = LancasterStemmer()
def LancasterTokenizer(s):
	from nltk import word_tokenize          
	from nltk.stem import LancasterStemmer
	stemmer = LancasterStemmer()
	return [stemmer.stem(t) for t in word_tokenize(s)]
Example No. 49
class LancasterTokenizer(object):
        def __init__(self):
            self.wnl = LancasterStemmer()
        def __call__(self, doc):
            return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]
Example No. 50
 def __init__(self):
     self.ls = LancasterStemmer()
     self.rx = RegexpTokenizer(r"(?u)\b\w\w+\b")
__author__ = "pratap"

# Porter Stemmer
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print stemmer.stem("cooking")
print stemmer.stem("cookery")

# Lancaster Stemmer
from nltk.stem import LancasterStemmer

lanc_stemmer = LancasterStemmer()
print lanc_stemmer.stem("cooking")
print lanc_stemmer.stem("cookery")
Example No. 52
 def __init__(self):
     """
     Constructs a tokenizer object
     """
     self.stemmer = LancasterStemmer()
import nltk
from nltk.stem import LancasterStemmer
stemmerlan=LancasterStemmer()
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))
Example No. 54
            Tokens2.append(w)
    # strip any non-ASCII characters that survived the earlier cleaning
    Tokens2 = [''.join(char for char in w if ord(char) < 128) for w in Tokens2]
    #stemming
    Tokens3 = []
    for w in Tokens2:
        Tokens3.append(lanStem.stem(w))
    return Tokens3
    

#read files
tesSum1 = teslaSummary1.read()
tesSum5 = teslaSummary5.read()

test = clean(tesSum1)

#stpWrds = set(stopwords.words("english"))
#print stpWrds

sum1SentTok = sentTok(tesSum1)
sum2SentTok = sentTok(tesSum2)

sum1WordTok = wordTok(tesSum1)
sum2WordTok = wordTok(tesSum2)

for w in sum1WordTok:
    print(lanStem.stem(w))
print remove_repeated_characters(sample_sentence_tokens)    


# porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

print ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

print ps.stem('lying')

print ps.stem('strange')

# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

print ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')

print ls.stem('lying')

print ls.stem('strange')


# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)

print rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped')

print rs.stem('lying')
Example No. 56
	def stemmed(self,word):
		stemmer = LancasterStemmer()
		return stemmer.stem(word)
Example No. 57
 def __init__(self, tfidf_file, id2wordFile=None):
   self.model = models.TfidfModel.load(tfidf_file)
   self.stemmer = LancasterStemmer()
   self.stopwords = set([self._preprocess_word(word) for word in stopwords])
   #self.stem_model()
   print "done"    
# -*- coding: utf-8 -*-
# <nbformat>2</nbformat>

# <markdowncell>

# <h2>Stemming Words</h2>
# <p>Stemming is the process of removing <em>affixes</em> from a word to obtain its root, or <em>stem</em>. For example, the stem of <strong>
#     growing</strong> is <strong>grow</strong>. </p>
# <p>NLTK includes four stemming algorithms, three of which are demonstrated below. The fourth, <em>Snowball</em>, also supports non-English languages
#     and is not covered here, but it is in the text. </p>

# <codecell>

from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer
porter = PorterStemmer()
lancaster = LancasterStemmer()
reg = RegexpStemmer('ing')
g = 'growing'
print 'Porter yields: ',porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>

# <p>The output of various words can be different between stemmers:</p>

# <codecell>

g = 'cookery'
print 'Porter yields: ',porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
Example No. 59
 def __init__(self):
   self.stemmer = LancasterStemmer()
   self.stopwords = set([self.stemmer.stem(word) for word in stopwords])
Example No. 60
       return False
def isMedium(url):
    print(url)
    a = None
    try:
        a = re.search(r'^http://thenextweb.+?', url)
    except UnicodeEncodeError:
        print('encode error')
    else:
        print('encode success')
    if a:
        return True
    else:
        return False


stem = LancasterStemmer()
wnl = WordNetLemmatizer()
wordStemed = ''
wordLemmatized = ''
urlBegin = 'http://thenextweb.com/section/tech/'
urlUnused.add(urlBegin)
urlUnused.add('http://thenextweb.com/apple/2016/10/13/apple-in-talks-with-australian-company-to-bring-dynamic-keyboard-tech-to-macbooks/')
i=0
r=800
#load url
while r<1200:
    tempArticle = ''
    url = urlUnused.pop()
   # print url
    if url not in urlUsed:
       headers = {'User-Agent' : user_agent}