Example #1
def des_extrect():
    filename_list = []
    file_stopwords = open('stopwords.txt', "r")
    stopwords = [line.strip() for line in file_stopwords.readlines()]  
    for file_name in os.listdir(DESCRIPTION_DIR):
        filename_list.append(file_name) 
    for filename in filename_list:
        path =  os.path.join(DESCRIPTION_DIR, filename)
        fr = open(path, 'r')
        fw = open(filename + '.des', 'w')
        soup = BeautifulSoup(fr.read())
        docs = soup.findAll('doc')
        for doc in docs:
            content = str(doc['title'] + doc.snippet.text)
            content = re.sub(r"[.@,:;!?()]", "", content)
            stemmer = SnowballStemmer('english')
            content = content.split()
            pro_content = ''
            for w in content: 
                w = stemmer.stem(w)
                # remove stopwords
                if w not in stopwords:
                    pro_content += w + ' '
            fw.write(doc['rank'] + ' ' +pro_content+'\n')
        fw.close()
        fr.close()
def text_token_data_generator():
    global id_text_index_map
    translation_table = str.maketrans(
        string.punctuation + string.ascii_uppercase,
        " " * len(string.punctuation) + string.ascii_lowercase,
    )
    snowball_stemmer = SnowballStemmer("english")
    for f in glob.glob("json/text/*.json"):
        for line in open(f).readlines():
            extract_row = json.loads(line)
            id_text_index_map[extract_row["file_id"]] = len(id_text_index_map)
            visible_text = extract_row["visible_text"].encode("ascii", "ignore").decode("ascii")
            visible_text = visible_text.translate(translation_table)
            visible_text = [
                snowball_stemmer.stem(word)
                for word in visible_text.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]
            title = extract_row["title"].encode("ascii", "ignore").decode("ascii")
            title = title.translate(translation_table)
            title = [
                "t^{}".format(snowball_stemmer.stem(word))
                for word in title.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]
            visible_text.extend(title)
            yield " ".join(visible_text)
Example #3
def normalized_token(token):
    """
    Use stemmer to normalize the token.
    Call this function when building the graph, instead of storing the altered word forms in file_text.
    """
    stemmer = SnowballStemmer("english") 
    return stemmer.stem(token.lower())
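A minimal usage sketch (assuming NLTK is installed and the function above is in scope); lowercasing first makes mixed-case inflections map to the same stem:

print(normalized_token("Running"))  # -> 'run'
print(normalized_token("Studies"))  # -> 'studi'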
Example #4
class ModelBuilder():

    def __init__(self):
        self.model = {}
        self.stemmer = SnowballStemmer('english')

    def build(self):
        with open('data/candidate_synonyms.txt') as f:
            all_words = f.read().split('\n')
            for words in all_words:
                if words:
                    word, similar = words.split(',')
                    word, similar = self.stemmer.stem(word), self.stemmer.stem(similar)
                    if word not in self.model: self.model[word] = {}
                    self.model[word][similar] = 1
        return self

    def condense(self):
        condensed_model = {}
        for word, similars in self.model.items():
            for similar in similars:
                if word in self.model.get(similar, {}):
                    if word in condensed_model:
                        condensed_model[word].append(similar)
                    else:
                        condensed_model[word] = [similar]
        self.model = condensed_model
        return self
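A short usage sketch for ModelBuilder; the file contents shown in the comment are an assumption based on build(), which expects one 'word,similar' pair per line, while condense() keeps only pairs whose stems point at each other:

# hypothetical data/candidate_synonyms.txt:
#   car,automobile
#   automobile,car
builder = ModelBuilder()
model = builder.build().condense().model
print(model)  # e.g. {'car': ['automobil'], 'automobil': ['car']}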
Example #5
def procesar(request, identificador):
	lmtzr = WordNetLemmatizer()
	d = Documento.objects.get(id=identificador)
	
	#nltk.corpus.cess_esp.words()
	
	
	tokens = nltk.word_tokenize(d.contenido.replace('.', ' . '))
	#print tokens
	#scentence = d.contenido

	#scentence = scentence.lower() 

	words = tokens
	spanish_stemmer = SnowballStemmer('spanish')
	

	#This is the simple way to remove stop words
	important_words=[]
	for word in words:
		if word not in stopwords.words('spanish'):
		    important_words.append([word, lmtzr.lemmatize(word), spanish_stemmer.stem(word)])




	return render_to_response('templates/documentoProcesado.html', 
				{
					'original': d.contenido,
					'tokens': tokens,
					'important_words' : important_words,
					#'pos_tags': pos_tags,
					#'ne_chunks': ne_chunks.subtrees(),
				})
Example #6
def stemmed(text,language):
    stemmer= SnowballStemmer(language)
    tas=text.split()
    text=""
    for word in tas:
        text=" ".join((text,stemmer.stem(word)))
    return text.lstrip()
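A quick usage sketch; any language listed in SnowballStemmer.languages can be passed:

print(stemmed("running quickly", "english"))        # 'run quick'
print(stemmed("corriendo rapidamente", "spanish"))  # e.g. 'corr rapid'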
def norm_corpus(document_list):
    norm_doc_list = []
    
    # lowercase
    document_list = [word.lower() for word in document_list]

    
    # remove symbols in text
    symbols = ",.?!"
    for sym in symbols:
        document_list = [word.replace(sym,'') for word in document_list]
    
    
    # loop through each string i.e. review in the column
    for doc in document_list:
        doc = nltk.word_tokenize(doc)
        
        # remove stopwords
        doc = [word for word in doc if word not in stopwords.words('english')]
        
        # stem words
        stemmer = SnowballStemmer("english")
        doc = [stemmer.stem(word) for word in doc]
        
        # make tokenised text one string
        norm_doc = " ".join(doc)
        norm_doc_list.append(norm_doc)
    
    return norm_doc_list
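A small usage sketch for norm_corpus (assuming NLTK's punkt and stopwords data are downloaded):

reviews = ["The food was great, but the service was slow!",
           "Terrible experience. We are never coming back."]
print(norm_corpus(reviews))
# e.g. ['food great servic slow', 'terribl experi never come back']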
def frequency_analysis(input_path, output_path, stopwords=None, n_most_common=50):
	recipes = []
	with open(input_path, 'r') as f:
		for i, line in enumerate(f):
			if line == '\n':
				break
			if i == 0:
				continue  # skip header
			fields = line.split('\t')
			recipes.append(fields[1].replace("\n", ""))
	recipe_text = re.sub("[^a-z ]", "", ' '.join(recipes))
	recipe_words = re.split("\s+", recipe_text)
	stemmer = SnowballStemmer("english")
	recipe_stems = [stemmer.stem(w) for w in recipe_words]
	if stopwords is not None:
		recipe_stems = filter(None, [s for s in recipe_stems if s not in stopwords])
	top_words = Counter(recipe_stems).most_common(n_most_common)

	# write to a file
	# do a second pass of the recipe to determine how many of the documents the term is in
	freq_table = open(output_path, 'w')
	for elt in top_words:
		doc_freq = sum([elt[0] in recipe for recipe in recipes])
		freq_table.write(','.join([str(e) for e in elt]) +','+ str(doc_freq) + '\n')
	freq_table.close()
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)
    
    #Remove Special Characters
    text=special_character_removal.sub('',text)
    
    #Replace Numbers
    text=replace_numbers.sub('n',text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)
def preprocessing(doc): #stop word as optional
        x = re.sub("[^a-zA-Z]", " ", doc) #only words
        x = x.lower().split()
        stemmer = SnowballStemmer("english") # use snowball
        stops = set(stopwords.words("english")) # set is faster than list
        x = [stemmer.stem(word) for word in x if word not in stops]
        return(x)
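A brief usage sketch (stopwords data assumed downloaded):

print(preprocessing("The cats are running across the gardens."))
# e.g. ['cat', 'run', 'across', 'garden']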
Example #11
 def __call__(self, doc ):
     snowball_stemmer = SnowballStemmer('english')
 	#tokenizer = RegexpTokenizer(r'\w+')
     #words=[self.wnl.lemmatize(t) for t in word_tokenize(doc)]
     words=[snowball_stemmer.stem(t) for t in word_tokenize(doc)]
     stop_words=set(stopwords.words('english'))
     stop_words.update(self.mystops)
     stop_words=list(stop_words)
     return [i.lower() for i in words if i not in stop_words]        
Example #12
def preprocess_tweets(tweets):
    stemmer = SnowballStemmer("english")
    stop = set(stopwords.words("english"))
    tweet_texts = [ " ".join(stemmer.stem(i) if len(i) > 1 else i
                                for i in ("".join(c for c in word if c not in string.punctuation)
                                            for word in tweet["text"].lower().split())
                                if i and i not in stop)
                    for tweet in tweets ]
    return list(set(tweet_texts))
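A usage sketch with a hypothetical tweet list; each tweet is a dict with a "text" key, as the function expects, and duplicates collapse after cleaning:

tweets = [{"text": "Loving the new phone!!"}, {"text": "loving the NEW phone"}]
print(preprocess_tweets(tweets))  # e.g. ['love new phone']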
Example #13
def stemLem(w):
	lemmatizer = WordNetLemmatizer()
	stemmer = SnowballStemmer("english")
	#stemmer = PorterStemmer()

	lem = lemmatizer.lemmatize(w)
	if len(w) > len(lem):
		return lem
	return stemmer.stem(w)
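A usage sketch (WordNet data assumed downloaded); the lemma wins only when it is shorter than the input, otherwise the Snowball stem is returned:

print(stemLem("cars"))     # 'car'  (the lemma is shorter)
print(stemLem("running"))  # 'run'  (the noun lemma is unchanged, so the stem is used)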
def stemWordMatch2(question,sentence):


    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens=set(nltk.word_tokenize(sentence))

    #  Finding the match between two words from the same root using the Lancaster stemmer

    '''stemmer=LancasterStemmer()

    for i in sentence_tokens:
        stem_words_list.append(stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(stemmer.stem(i))

    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list

    stem_count=0
    for i in stem_words_list:
        #Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count=stem_count+6
    stem_word_match_counter.append(count)'''

    stem_word_match_counter=[]
    stem_words_list=[]
    question_words_list=[]

    #  Finding the match between two words from the same root using the Snowball stemmer

    snowball_stemmer = SnowballStemmer('english')

    for i in sentence_tokens:
        stem_words_list.append(snowball_stemmer.stem(i))

    for i in question_tokens:
        question_words_list.append(snowball_stemmer.stem(i))

    #print 'Stem word list',stem_words_list
    #print 'Question word list', question_words_list

    stem_count=0
    for i in stem_words_list:
        #Finding the exact word match
        if i.lower() in [x.lower() for x in question_words_list]:
            #print 'Question word is',x
            #print 'Sentence word stem is :',i
            #print 'Match'
            stem_count=stem_count+6
    #print 'Stem word count match score is :', stem_count

    return stem_count
    def stem(self, content):
        import re

        original_string = content
        new_content = re.sub('[^a-zA-Z0-9\n\.]', ' ', original_string)
        words = new_content.split()
        stemmer = SnowballStemmer('english')
        singles = [stemmer.stem(wordsa) for wordsa in words]
        return (' '.join(singles))
Example #16
def stemmed_top_user_words(usertxt, num=10):
	wl_usertxt = word_tokenize(usertxt.lower())
	num = min(num, len(wl_usertxt))

	snowball_stemmer = SnowballStemmer("english")
	stemmed_fl_usertxt = [snowball_stemmer.stem(w) for w in wl_usertxt if (len(w)>4 and w not in ewl)]
	fd_user_ls = [w[0] for w in FreqDist(Text(stemmed_fl_usertxt)).most_common(num)]

	return fd_user_ls
Example #17
def main(input_file, dbname):
    """
        Main function. Connects to a database and reads a\
        CSV with the arousal and valence. Uses the sentiment \
        library to compute the sentiment of a news item.

          :param input_file: the ANEW file
          :param dbname: the name of the database

    """

    # read ANEW file
    if not os.path.exists(input_file):
        logging.error('File %s does not exist', input_file)
        sys.exit(1)
    else:
        csvfile = open(input_file, 'r')
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip headers
        stemmer = SnowballStemmer('spanish')
        anew = dict([(stemmer.stem(row[2]),
                      {'valence': float(row[3]),
                       'arousal': float(row[5])}) for row in reader])

    couch = couchdb.Server()
    database = couch[dbname]
    logging.info('Established connection with the db %s', dbname)

    for element in database:
        doc = database.get(element)

        comments = " ".join([comment['cleaned_summary']
                            for comment in doc['comments']])
        description = " ".join([database.get(element)['title'],
                                doc['description']])

        sentiment_comments = get_sentiment(anew, comments)
        sentiment_description = get_sentiment(anew, description)

        if sentiment_comments is not None and sentiment_description is not None:
            logging.info('%s val: %.2f - %.2f aro: %.2f - %.2f : %s',
                         doc.id, sentiment_comments[0],
                         sentiment_description[0],
                         sentiment_comments[1],
                         sentiment_description[1],
                         doc['title'])
            doc['sentiments'] = {'comments':
                                {'valence': sentiment_comments[0],
                                 'arousal': sentiment_comments[1]},
                                 'description':
                                {'valence': sentiment_description[0],
                                 'arousal': sentiment_description[1]}}
            database.save(doc)

        else:
            logging.warning('%s could not be analyzed. skipping ...',
                         database.get(element)['title'])
Example #18
    def stem_text(self):
        '''
        Perform stemming
        '''

        stemmer = SnowballStemmer("english")
        stemmed_sents = []
        for sent in self.tok_text:
            stemmed_sents.append([stemmer.stem(tok) for tok in sent])
        self.stemmed_text = stemmed_sents  # store under a new name so the stem_text method is not clobbered
Example #19
def process_spanish_owned():
    from inflector import Inflector, Spanish
    inflector = Inflector(Spanish)

    from nltk.stem import SnowballStemmer
    stemmer = SnowballStemmer("spanish")

    file_valid = open('valid_words.txt', "r")
    lines = file_valid.readlines()
    valid_words = lines[0].split(' ')
    print(len(valid_words))
    file_valid.close()
    #valid_words = set(valid_words)
    owned_words = ['cúster', 'custer', 'cústers', 'custers', 'combi', 'combis', 'susana', 'villaran', 'villarán', 'castañeda']

    file = open("raw_words.txt", 'r')
    fileout = open("spanish_words_owned.txt", 'w')
    fout_sing = open("spanish_words_sing.txt", 'w')
    fout_stem = open("spanish_words_stem.txt", 'w')
    nline = 0

    for line in file:
        nline += 1
        words = line.split(' ')
        processed = []
        ini_line = True
        for word in words:
            if (word != '') & (word != '\n') & (word != 'servicio') & (word != 'servicio\n'):
                word = word.replace('\n', '')
                if (word in valid_words) | (word in owned_words):
                    processed.append(word)
                    if word != 'bus':
                        word_singular = inflector.singularize(word)
                        #word_singular = word_singular.replace(u'\xF3'.encode('utf-8'), 'o')
                    else:
                        word_singular = word
                    word_stemmed = stemmer.stem(word)
                    if ini_line:
                        fileout.write(word)
                        fout_sing.write(word_singular)
                        fout_stem.write(word_stemmed)
                        ini_line = False
                    else:
                        fileout.write(' ' + word)
                        fout_sing.write(' ' + word_singular)
                        fout_stem.write(' ' + word_stemmed)
                    print(nline, word, word_singular, word_stemmed)
        fileout.write('\n')
        fout_sing.write('\n')
        fout_stem.write('\n')
    file.close()
    fileout.close()
    fout_sing.close()
    fout_stem.close()
Example #20
def prepare_request(request, synonyms = False):
    #request = translate(request)
    request = re.sub(r"(\n)", " ", request.lower())
    request = re.sub(r"(-\n)", "", request)
    request = re.split("[^a-z0-9]", request)
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    if synonyms == True:
        request = add_synonyms([word for word in request if word not in stop_words])
    request = [stemmer.stem(word) for word in request if (word not in stop_words) & (len(word) > 1) & (len(word) < 20)]
    return ' '.join(request)
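A usage sketch with synonym expansion left off (add_synonyms is defined elsewhere in the source project):

print(prepare_request("Finding the BEST deep-learning papers\nfrom 2020"))
# e.g. 'find best deep learn paper 2020'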
Example #21
	def stemming(self, words):
		''' Stem each word in the array
		@return array of stemmed words '''

		russian_stemmer = SnowballStemmer('russian')
		stemming = list()
		for w in words:
			try:
				stemming.append(russian_stemmer.stem(w))
			except Exception:
				pass
		return stemming
def tokenize(resultList1):
    entrada=[]
    for i in range(0,len(resultList1)):
        sentence=resultList1[i]
        tokens = word_tokenize(sentence)
        filtered_words = [w for w in tokens if not w in stopwords.words('spanish')]

        stemmer = SnowballStemmer('spanish')
        for i in filtered_words:
            entrada.append( stemmer.stem(i))
    return entrada
def tokenize(resultList1):
    entrada=[]
    tokens = word_tokenize(resultList1)
    filtered_words = [w for w in tokens if not w in stopwords.words('spanish')]

    stemmer = SnowballStemmer('spanish')
    for i in filtered_words:
        entrada.append(stemmer.stem(i))

    return entrada
Example #24
def proc_text(text):
    s = remove_punctuation(text)
    ls = word_tokenize(s)
    
    # remove stop words
    sw = set(stopwords.words('spanish'))
    ls = [x for x in ls if x not in sw]

    # stem
    stemmer = SnowballStemmer('spanish')
    ls = [stemmer.stem(x) for x in ls]
    return ls
Example #25
def asr_to_bow(asr_file_path, vocab, dfs):
    stemmer = SnowballStemmer('english')
    vec = [0 for i in range(len(vocab))]
    for line in open(asr_file_path):
        word = line.split()[4]
        word = stemmer.stem(word)
        if word not in vocab:
            continue
        tid = vocab[word]
        vec[tid] += 1
    for i in range(len(vec)):
        vec[i] *= math.log(883.0/dfs[i])
    return vec
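A self-contained sketch of how asr_to_bow might be driven; the vocab/dfs shapes and the whitespace-delimited ASR line format with the token in the fifth column are assumptions read off the code above:

import tempfile
vocab = {'hello': 0, 'world': 1}
dfs = [10.0, 100.0]
with tempfile.NamedTemporaryFile('w', suffix='.ctm', delete=False) as f:
    f.write("utt1 1 0.00 0.50 hello\n")
    f.write("utt1 1 0.50 0.40 world\n")
print(asr_to_bow(f.name, vocab, dfs))  # each count scaled by log(883/df)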
Example #26
def main():
    from nltk.stem import SnowballStemmer # Imported to perform stemming on the data
    stemmer = SnowballStemmer('english')
    stop_words = stopwords.words("english")
    for line in sys.stdin:
        line = line.strip()
        id,label,review = line.split('||') # Separates each line into id,label,review
        html_strip = BeautifulSoup(review,'html.parser')
        words = re.sub("[^a-zA-Z]"," ",html_strip.get_text() )
        words = words.split()
        words = [w.lower() for w in words if w.lower() not in stop_words] #collecting words which are not stop words
        words = [stemmer.stem(word) for word in words]
        print('%s\t%s\t%s' % (label, id, ' '.join(words)))  # Mapper output with Label as key and the rest are values
Example #27
def wordnet_sim(query, db):
    """
    This function implements a simple WordNet definition lookup and compares it
    with a different block of text. For every word match between the definition
    token and text token doc receives +1.

    INPUT:
    query  --  string that represents user query expanded with word net defs
    db  --  dict representation of database xml file

    OUTPUT:
    maxdoc  --  the document with the highest score
    """
    # print('QUERY:', query)
    # initializing SnowballStemmer from nltk
    sst = SnowballStemmer("english")
    # taking stopwords from nltk
    stop = stopwords.words("english")
    # creating translation table to remove punctuation
    transnone = {ord(c): None for c in string.punctuation}
    # first we remove any punctuation and concatenate specific nodes into one
    query_nopunct = query.lower().translate(transnone)
    query_stems = [sst.stem(token) for token in query_nopunct.split() if token not in stop]
    doc_scores = defaultdict(float)
    for doc in db:
        for block, text in db[doc].items():
            # normalize block text
            if not text:
                continue
            text_nopunct = text.lower().translate(transnone)
            text = [sst.stem(t) for t in text_nopunct.split() if t not in stop]
            if len(text) == 0:
                text += " "
            # here we can finetune the block score multiplicators
            # some blocks are more important than the others
            if block == "description":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 2
            elif block == "trivia":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "history":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "comments":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text)
    maxdoc = max(doc_scores, key=lambda x: doc_scores[x])
    debug = sorted([(k, v) for k, v in doc_scores.items()], key=lambda x: x[1])
    return (debug, maxdoc)
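A toy invocation of wordnet_sim; the db structure (a dict of documents, each mapping block names to text) is inferred from the loop above:

db = {
    'doc1': {'description': 'A film about space travel and astronauts.', 'trivia': ''},
    'doc2': {'description': 'A romantic comedy set in Paris.', 'trivia': ''},
}
scores, best = wordnet_sim('astronaut space mission', db)
print(best)  # 'doc1'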
Example #28
def clean_data(data):
	punctuations = list(string.punctuation)
	data = data.replace("\n"," ").replace(":", " ").replace(",","").replace(".","").replace("'s","").replace("?","")
	stemmer = PorterStemmer()
	stemmer2 = SnowballStemmer('english')
	tokenizer = RegexpTokenizer(r'\w+')
	tokenizer.tokenize(data)
	ndata1 = list(mysplit(data))
	ndata1 = [[stemmer.stem(xi) for xi in y.split(" ")] for y in ndata1] 
	ndata1 = [[stemmer2.stem(xi) for xi in y] for y in ndata1]
	ndata = [x for x in ndata1 if not x == ":"]
	ndata = [list(filter(None, x)) for x in ndata]
	ndata = [x for x in ndata if x != []]
	return ndata
Example #29
File: views.py Project: ajm/pulp
def get_stems(articles) :
    stems = collections.defaultdict(list)

    stopwords = get_stop_words()
    stemmer = SnowballStemmer('english')

    for i in articles :
        for word,stem in [ (word,stemmer.stem(word)) for word in clean_text(i.title + ' ' + i.abstract).split() if word not in stopwords ] :
            if stem not in stems[i.id] :
                stems[i.id].append(stem)

    for k in stems :
        stems[k].sort()

    return dict(stems)
Example #30
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    text = " ".join(text)
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)
Example #31
import argparse
import numpy as np
import pandas as pd
import math
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine

from sentiment_sampling import linear_svm
from utils import noise_generator
from tqdm import tqdm
from random import random, choice
from six.moves import cPickle
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
stop = set(stopwords.words('english'))
snowball_stemmer = SnowballStemmer("english")


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-m",
                        "--mode",
                        help="random, word2vec, robust",
                        type=str,
                        default="random")
    parser.add_argument("-s",
                        "--save-dir",
                        help="directory with stored robust model",
                        type=str,
                        default="save")
    parser.add_argument("-w",
Example #32
import pickle 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
from matplotlib import figure

from sklearn.metrics import accuracy_score, fbeta_score,classification_report
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words("english")

from nltk.stem import SnowballStemmer
ss = SnowballStemmer("english")

data = pd.read_csv('C:/Users/user/Downloads/emails.csv')
data.shape
data.columns
data = data.iloc[0:10000,3:5]
data.describe()
stop

data.groupby('Class').describe().T

data["Class"].value_counts().plot(kind = 'pie', explode = [0,0.1], figsize = (6,6), autopct = '%1.2f%%')
plt.ylabel("Abusive vs Non Abusive")
plt.legend(["Abusive", " Non Abusive"])
plt.show()
Example #33
def processCluster(Dir):
    global senlist
    global toklist
    global senvec
    senlist = []
    toklist = []
    filelist = os.listdir(Dir)
    articles = ""
    for fil in filelist:
        with open(Dir + "/" + fil) as f:
            text = f.read()
            if '<text>' in text:
                res_tr = r'<text>(.*?)</text>'
                m_tr = re.findall(res_tr, text, re.S | re.M)
                text = m_tr[0]
            articles += " " + text
    articles = filterDoc(articles)
    senStart = []
    senEnd = []
    lenth = len(articles)
    isStart = True
    for i in range(lenth):
        if isStart and articles[i] != ' ':
            senStart.append(i)
            isStart = False
        if articles[i] == '.' or articles[i] == '?' or articles[i] == '!':
            senEnd.append(i)
            isStart = True
    for i in range(len(senEnd)):
        senlist.append(articles[senStart[i]:senEnd[i] + 1])
    stemmer = SnowballStemmer("english")
    for s in senlist:
        toklist.append(s.split(' '))
    tmplist = []
    siglist = ['.', ':', '?', '!', "'s", '"']
    for s in senlist:
        s = s.lower()
        for sig in siglist:
            s = s.replace(sig, '')
        s = s.split(' ')
        tempsen = [stemmer.stem(w) for w in s]
        sen = ""
        for w in tempsen:
            sen += w + " "
        tmplist.append(sen)

    vectorizer = TfidfVectorizer(stop_words='english')
    senvec = vectorizer.fit_transform(tmplist)

    #toklist = vectorizer.inverse_transform(toklist)
    senvec = senvec.toarray()
    print(list(senvec[1]))
    print(len(senvec[1]))
    '''
    with open("test",'w') as fw:
        fw.write(str(len(senlist)))
        fw.write('\n')
       # fw.write(str(len(toklist)) + '\n')        
        fw.write(str(toklist) + '\n')
    '''
    return (senlist, toklist, senvec)
Example #34
# coding: utf-8

# In[5]:

#get_ipython().system(u'pip install --upgrade pip')
#get_ipython().system(u'pip install tensorflow')
#get_ipython().system(u'pip install tflearn')

# In[9]:

# things we need for NLP
import nltk
#nltk.download('punkt')  # To fix the 'tokenizers/punkt/PY3/english.pickle' lookup error
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("spanish")

# things we need for Tensorflow
import numpy as np
import tflearn
import tensorflow as tf
import random

# In[10]:

# import our chat-bot intents file
import json
with open('c:\\Users\\Lucs\\Desktop\\ChatPy\\intents_esp.json') as json_data:
    intents = json.load(json_data)
#intents

# In[11]:
Example #35
    # Convert words to lower case
    keywords = keywords.lower()
    #Tokenize document and remove all non-characters
    tokenizer = RegexpTokenizer('[a-z]\w+')
    tokened_text = tokenizer.tokenize(keywords)
    text_no_sw = []

    # TODO: Remove stopwords
    stop_words = set(stopwords.words('english'))

    for word in tokened_text:
        if word not in stop_words:
            text_no_sw.append(word)
    # print('before',text_no_sw)
    # TODO: Stem words
    stemmer = SnowballStemmer('english')
    tokens_stemmed = [stemmer.stem(x) for x in text_no_sw]
    frequency_dict = {}
    for keyword in tokens_stemmed:
        if not keyword in set(frequency_dict.keys()):
            frequency_dict[keyword] = 0
        frequency_dict[keyword] += 1
    processed_raw_data[date] = frequency_dict

print('------------- processed keywords/frequencies (news.db) -------------')

##################################################################

# Insert entries into tables
for date in processed_raw_data:
    frequency_dict = processed_raw_data[date]
Example #36
import os
import time
import spacy
import platform
import functools
import KeyExt.config
from keybert import KeyBERT
from string import punctuation
from nltk.stem import SnowballStemmer
from stempel import StempelStemmer

# Initialize all required stemmers once.
stemmers = {
    'english': SnowballStemmer('english'),
    'french': SnowballStemmer('french'),
    'spanish': SnowballStemmer('spanish'),
    'portuguese': SnowballStemmer('portuguese'),
    'polish': StempelStemmer.default()
}


def load_models():
    """
    Function which loads the english NLP model, and the Keybert model.
    This needs to run once since all models need a few seconds to load.
    """
    return (spacy.load('en_core_web_sm'),
            KeyBERT('distiluse-base-multilingual-cased-v2'))


def preprocess(lis, language):
def clean_text(text,
               remove_stopwords=False,
               stem_words=False,
               count_null_words=True,
               clean_wiki_tokens=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    # dirty words
    text = text.lower()
    text = re.sub(
        r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
        "", text)
    text = re.sub(
        r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}",
        "", text)

    if clean_wiki_tokens:
        # Drop the image
        text = re.sub(r"image:[a-zA-Z0-9]*\.jpg", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*\.png", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*\.gif", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*\.bmp", " ", text)

        # Drop css
        text = re.sub(r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})", " ", text)
        text = re.sub(r"\{\|[^\}]*\|\}", " ", text)

        # Clean templates
        text = re.sub(r"\[?\[user:.*\]", " ", text)
        text = re.sub(r"\[?\[user:.*\|", " ", text)
        text = re.sub(r"\[?\[wikipedia:.*\]", " ", text)
        text = re.sub(r"\[?\[wikipedia:.*\|", " ", text)
        text = re.sub(r"\[?\[special:.*\]", " ", text)
        text = re.sub(r"\[?\[special:.*\|", " ", text)
        text = re.sub(r"\[?\[category:.*\]", " ", text)
        text = re.sub(r"\[?\[category:.*\|", " ", text)

    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"\!", " ! ", text)
    text = re.sub(r"\"", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub(' ', text)
    text = special_character_removal.sub('', text)

    if count_null_words:
        text = text.split()
        for t in text:
            word_count_dict[t] += 1
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    return (text)
Example #38
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", " 911 ", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", " direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text)
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text)
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)

    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])

    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        #text = [w for w in text if not w in ['a', 'an', 'the']]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return a list of words
    return (text)
Example #39
    print(w)

#Porter Stemmer
pStemmer = PorterStemmer()
print("Porter steeming output \n")
for p in wordtokens:
    print(pStemmer.stem(str(p)))

#lancasters Stemmer
lStemmer = LancasterStemmer()
print(" Lancaster stemming output\n")
for t in wordtokens:
    print(lStemmer.stem(str(t)))

#Snowball Stemmer
sStemmer = SnowballStemmer('english')
print("Snowball steeming oupput \n")
for s in wordtokens:
    print(sStemmer.stem(str(s)))

#parts of speech
print("Parts of Speech \n")
print(nltk.pos_tag(wordtokens))

#Lemmatizer
print("Lemmatizer \n")
lemmatizer = WordNetLemmatizer()
for l in wordtokens:
    print(lemmatizer.lemmatize(str(l)))

#Trigram
Example #40
from utilities.db_manager import DBManager

# Import Packages for NLP
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# import nltk.stem as stemmer
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

stemmer = SnowballStemmer('english')

# DWH = os.getenv('MIMIC_DWH')
# engine = create_engine(DWH)

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Currently set limit to 200,000 for just testing purposes. Will want to remove later.
QUERY = """
select
  subject_id,
  hadm_id,
  chartdate,
  text
Example #41
def lemmatize_stemming(text):
	stemmer = SnowballStemmer("english")
	#text = text.decode('utf-8')
	return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
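A quick usage sketch (WordNet data assumed downloaded); lemmatizing with pos='v' first reduces inflected verbs to their base form before stemming:

print(lemmatize_stemming("studying"))  # lemma 'study' -> stem 'studi'
print(lemmatize_stemming("went"))      # lemma 'go' -> stem 'go'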
Example #42
 def my_clean(text,stops = False,stemming = False,minLength = 2):
     text = str(text)
     text = re.sub(r" US ", " u s ", text)
     text = text.lower().split()
     if stemming and stops:
         text = [word for word in text if word not in stopwords.words('english')]
         wordnet_lemmatizer = WordNetLemmatizer()
         englishStemmer = SnowballStemmer("english", ignore_stopwords=False)
         text = [englishStemmer.stem(word) for word in text]
         text = [wordnet_lemmatizer.lemmatize(word) for word in text]
         text = [word for word in text if word not in stopwords.words('english')]
     elif stops:
         text = [word for word in text if word not in stopwords.words('english')]
     elif stemming:
         wordnet_lemmatizer = WordNetLemmatizer()
         englishStemmer = SnowballStemmer("english", ignore_stopwords=False)
         text = [englishStemmer.stem(word) for word in text]
         text = [wordnet_lemmatizer.lemmatize(word) for word in text]
     text = " ".join(text)
     text = re.sub(r"what's", "what is ", text)
     text = re.sub(r"don't", "do not ", text)
     text = re.sub(r"aren't", "are not ", text)
     text = re.sub(r"isn't", "is not ", text)
     text = re.sub(r"%", " percent ", text)
     text = re.sub(r"that's", "that is ", text)
     text = re.sub(r"doesn't", "does not ", text)
     text = re.sub(r"he's", "he is ", text)
     text = re.sub(r"she's", "she is ", text)
     text = re.sub(r"it's", "it is ", text)
     text = re.sub(r"\'s", " ", text)
     text = re.sub(r"\'ve", " have ", text)
     text = re.sub(r"n't", " not ", text)
     text = re.sub(r"i'm", "i am ", text)
     text = re.sub(r"\'re", " are ", text)
     text = re.sub(r"\'d", " would ", text)
     text = re.sub(r"\'ll", " will ", text)
     text = re.sub(r" e - mail ", " email ", text)
     text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
     text = re.sub(r",", " ", text)
     text = re.sub(r"\.", " ", text)
     text = re.sub(r"!", " ", text)
     text = re.sub(r";", " ", text)
     text = re.sub(r"\/", " ", text)
     text = re.sub(r"\^", " ", text)
     text = re.sub(r"\+", " ", text)
     text = re.sub(r"\-", " ", text)
     text = re.sub(r"\=", " ", text)
     text = re.sub(r"'", " ", text)
     text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
     text = re.sub(r":", " : ", text)
     text = re.sub(r" e g ", " eg ", text)
     text = re.sub(r" b g ", " bg ", text)
     text = re.sub(r" u s ", " american ", text)
     text = re.sub(r"\0s", "0", text)
     text = re.sub(r" 9 11 ", "911", text)
     text = re.sub(r" j k ", " jk ", text)
     text = re.sub(r"\s{2,}", " ", text)
     text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", text) #Removes every number
     text = text.lower().split()
     text = [w for w in text if len(w) >= minLength]
     if stemming and stops:
         text = [word for word in text if word not in stopwords.words('english')]
         wordnet_lemmatizer = WordNetLemmatizer()
         englishStemmer = SnowballStemmer("english", ignore_stopwords=False)
         text = [englishStemmer.stem(word) for word in text]
         text = [wordnet_lemmatizer.lemmatize(word) for word in text]
         text = [word for word in text if word not in stopwords.words('english')]
     elif stops:
         text = [word for word in text if word not in stopwords.words('english')]
     elif stemming:
         wordnet_lemmatizer = WordNetLemmatizer()
         englishStemmer = SnowballStemmer("english", ignore_stopwords=False)
         text = [englishStemmer.stem(word) for word in text]
         text = [wordnet_lemmatizer.lemmatize(word) for word in text]
     text = " ".join(text)
     return text
Example #43
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from textblob import TextBlob
from langdetect import detect_langs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
import statistics

spanishStemmer = SnowballStemmer("spanish", ignore_stopwords=True)
exclude = set(string.punctuation)
WORD2VECMODEL = "/Users/frandm/Documents/Tesis/Code/SBW-vectors-300-min5.bin.gz"

THRESHOLD = 15
VOCAB_SIZE = 5000

stop_words = set(stopwords.words('spanish'))


def dblite_connect(dbname):

    conn = sqlite3.connect(dbname)
    return conn.cursor(), conn

Example #44
def main():
    parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data', type=str, default='../data/',
                        help='location of the data corpus')
    parser.add_argument('--presaved', action='store_true',
                        help='use presaved data')
    parser.add_argument('--glovedata', type=str, default='../data/glove.6B',
                        help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30,
                        help='length of LSTM')
    parser.add_argument('--demb', type=int, default=300,
                        help='size of word embeddings')
    parser.add_argument('--dhid', type=int, default=300,
                        help='number of hidden units per layer')
    parser.add_argument('--dlin', type=int, default=500,
                        help='number linear transformation nodes')
    parser.add_argument('--dout', type=int, default=2,
                        help='number of output classes')
    parser.add_argument('--nlayers', type=int, default=1,
                        help='number of layers')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='initial learning rate')
    parser.add_argument('--wd', type=float, default=0.0,
                        help='adam l2 weight decay')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--embinit', type=str, default='random',
                        help='embedding weight initialization type')
    parser.add_argument('--decinit', type=str, default='random',
                        help='decoder weight initialization type')
    parser.add_argument('--hidinit', type=str, default='random',
                        help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--rnn', type=str, default='lstm',
                        help='lstm or gru')
    parser.add_argument('--epochs', type=int, default=40,
                        help='upper epoch limit')
    parser.add_argument('--batchsize', type=int, default=20, metavar='N',
                        help='batch size')
    parser.add_argument('--seed', type=int, default=3,
                        help='random seed')
    parser.add_argument('--vocabsize', type=int, default=200000,
                        help='random seed')
    parser.add_argument('--optimizer', action='store_true',
                        help='use ADAM optimizer')
    parser.add_argument('--pipeline', action='store_true',
                        help='use pipeline file')
    parser.add_argument('--psw', type=int, default=1,
                        help='remove stop words')
    parser.add_argument('--ppunc', action='store_true',
                        help='remove punctuation')
    parser.add_argument('--pntok', action='store_true',
                        help='use number tokens')
    parser.add_argument('--pkq', action='store_true',
                        help='keep question words')
    parser.add_argument('--stem', action='store_true',
                        help='use stemmer')
    parser.add_argument('--lemma', action='store_true',
                        help='use lemmatizer')
    parser.add_argument('--bidir', action='store_false',
                        help='bidirectional')
    parser.add_argument('--freezeemb', action='store_false',
                        help='freezes embeddings')
    parser.add_argument('--cuda', action='store_true',
                        help='use CUDA')
    parser.add_argument('--loginterval', type=int, default=100, metavar='N',
                        help='report interval')
    parser.add_argument('--save', type=str,  default='',
                        help='path to save the final model')
    args = parser.parse_args()

    

    if not args.presaved:
        pipe = None
        if args.pipeline:
            stemmer, lemmatizer = None, None
            if args.stem:
                stemmer = SnowballStemmer('english')
            elif args.lemma:
                lemmatizer = WordNetLemmatizer()

            pipe = functools.partial(pipeline, 
                                    rm_stop_words=args.psw, 
                                    rm_punc=args.ppunc, 
                                    number_token=args.pntok, 
                                    keep_questions=args.pkq,
                                    stemmer=stemmer,
                                    lemmatizer=lemmatizer)

        corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)
        print('Loading Data')
        # train_data = pd.read_csv(args.data)
        #Shuffle order of training data

        train_data = pd.read_csv('../data/train_data_shuffle.csv')
        val_data = pd.read_csv('../data/val_data_shuffle.csv')

        print('Cleaning and Tokenizing')
        q1, q2, y = clean_and_tokenize(train_data, corpus)
        q1_val, q2_val, y_val = clean_and_tokenize(val_data, corpus)

        train_feat = list(map(feature_gen, zip(q1, q2)))
        val_feat = list(map(feature_gen, zip(q1_val, q2_val)))
        scalar = preprocessing.StandardScaler()
        train_feat = scalar.fit_transform(train_feat)
        val_feat = scalar.transform(val_feat)

        print('Piping Data')
        q1 = corpus.pipe_data(q1)
        q2 = corpus.pipe_data(q2)
        q1_val = corpus.pipe_data(q1_val)
        q2_val = corpus.pipe_data(q2_val)

        corpus.gen_vocab(q1 + q2 + q2_val + q1_val)

        n_feat = train_feat.shape[1]
        d_in = args.din
        feat_max = int(np.max([n_feat, d_in]))

        X = torch.Tensor(len(train_data), 1, 3, feat_max)
        X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long()
        X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long()
        X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat))
        y = torch.from_numpy(np.array(y)).long()

        X_val = torch.Tensor(len(val_data), 1, 3, feat_max)
        X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long()
        X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long()
        X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
        y_val = torch.from_numpy(np.array(y_val)).long()

        torch.save(X, '../data/X_feat.t')
        torch.save(y, '../data/y_feat.t')
        torch.save(X_val, '../data/X_val_feat.t')
        torch.save(y_val, '../data/y_val_feat.t')
        with open(args.save + '_corpus_feat.pkl', 'wb') as corp_f:
            pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)

    else:
        n_feat = 22
        d_in = args.din
        print('Loading Presaved Data')
        X = torch.load(args.data + 'X_feat.t')
        y = torch.load(args.data + 'y_feat.t')
        X_val = torch.load(args.data + 'X_val_feat.t')
        y_val = torch.load(args.data + 'y_val_feat.t')
        with open('../data/corpus_feat.pkl', 'rb') as f:
            corpus = pkl.load(f)


    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()

    print('Generating Data Loaders')
    #X.size len(train_data),1,2,fix_length
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset, 
                                batch_size=args.batchsize, 
                                shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                                batch_size=args.batchsize,
                                shuffle=False)

    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata, corpus.dictionary.word2idx, ntokens, args.demb)

    model = ConvRNNFeat(args.din, args.dhid, args.dout, args.demb, args.dlin, args.vocabsize, 
                        args.dropout, args.embinit, args.hidinit, args.decinit, 
                        glove_embeddings, args.cuda, args.rnn, args.bidir, n_feat)

    if args.cuda:
        model.cuda()

    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)

    model_config = '\t'.join([str(x) for x in (torch.__version__, args.rnn, args.bidir, args.clip, args.nlayers, args.din, args.demb, args.dhid, args.dlin,
                        args.embinit, args.decinit, args.hidinit, args.dropout, args.optimizer, args.lr, args.wd, args.vocabsize,
                        args.pipeline, args.psw, args.ppunc, args.pntok, args.pkq, args.stem, args.lemma)])

    print('Pytorch | RNN  | BiDir | Clip | #Layers | InSize | EmbDim | HiddenDim | LinearDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer| LR | WeightDecay | VocabSize | pipeline | stop | punc | ntoken | keep_ques | stem | lemma')
    print(model_config)

    # best_val_acc = 0.78
    best_ll = 0.5
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            pred = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(), qs[:, 0, 2, :n_feat])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)

            if optimizer:
                optimizer.step()
            else:
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)

            total_cost += loss.data[0]
            cur_loss += loss.data[0]

            if ind % args.loginterval == 0 and ind > 0:
                cur_loss = loss.data[0] / args.loginterval
                elapsed = time.time() - start_time
                print('| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                        'Loss {:.6f}'.format(
                            epoch, ind, len(X) // args.batchsize,
                            elapsed * 1000.0 / args.loginterval, cur_loss))
                start_time = time.time()
                cur_loss = 0

        model.eval()
        train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in, n_feat)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in, n_feat)
        # if args.save and (val_acc > best_val_acc):
        if args.save and (val_ll < best_ll):
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + ".state_dict")
            with open(args.save + ".state_dict.config", "w") as f:
                f.write(model_config)
            best_ll = val_ll
            if args.cuda:
                model.cuda()


        print('Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'.format(
            epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)
Example #45
if __name__ == "__main__":

    train_df = pd.read_csv('data/training_tweets_es.txt', sep='\t', header=0)
    classlabels = pd.read_csv('data/training_truth_es.txt', header=0)

    raw_docs_train = train_df['phrase'].values
    classlabels1 = classlabels['label'].values
    #print(classlabels1.shape)
    #print(len(raw_docs_train))
    #print(raw_docs_train[4318])

    stop_words = set(stopwords.words('dutch'))
    stop_words.update(
        ['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
    stemmer = SnowballStemmer('dutch')

    print "pre-processing train docs..."
    processed_docs_train = []
    for doc in raw_docs_train:
        doc = doc.decode("utf8")
        tokens = word_tokenize(doc)
        filtered = [word for word in tokens if word not in stop_words]
        stemmed = [stemmer.stem(word) for word in filtered]
        processed_docs_train.append(stemmed)

    processed_docs_all = processed_docs_train

    dictionary = corpora.Dictionary(processed_docs_all)
    dictionary_size = len(dictionary.keys())
    print "dictionary size: ", dictionary_size
Example #46
    # print(cleaned)

    # stemmer = SnowballStemmer("english")
    # print(stemmer.stem("hygenist"))
    # print(" ".join([stemmer.stem(word) for word in cleaned.split()]))

    # print(review)
    # v.parse_words()
    # v.save_vocabulary()

    # t = Tokenizer("tokenizer", "vocabulary.txt")
    # print(len(t.word2Index))

    # print(clean_sentence("I really enjoyed my stay at this hotel on 5/12/2020!! This is jibberish abcd but here is this cool website: https://www.berkeley.edu"))
    sent = "I really enjoyed my stay at this hotel on five one two two zero two zero ! !  I am sure we will come again! Do not mind this jibberish abcd but here is this cool website :"
    stemmer = SnowballStemmer("english")
    cleaned_review = " ".join([stemmer.stem(word) for word in sent.split()])
    tokenized_review = tokenizer.word_tokenizer(cleaned_review)
    print(tokenized_review)
    # bpe = ByteBPETokenizer("yelp_bpe/yelp-bpe-vocab.json", "yelp_bpe/yelp-bpe-merges.txt")
    # enc = bpe.encode("I really enjoyed my stay at this hotel on 5/12/2020!! This is jibberish abcd but here is this cool website: https://www.berkeley.edu")
    # print(enc.tokens)
    # bpe.trainBPE(paths=["cleaned_reviews.txt"], vocab_size=25000)
Ejemplo n.º 47
0
tknzr = TweetTokenizer()
#make list of tokens instead of list of tweets
tokenizedTweets = [tknzr.tokenize(i) for i in tweets]
print("Tokenized Tweets: ", tokenizedTweets)

#remove stop words (to get these stopwords, uncomment the following two lines; this only has to be run once)
#import nltk
#nltk.download("stopwords")
stopwords = stopwords.words('english')
#remove every word from tokenized tweets which is in stopwords (keep the rest)
filteredTweets = [[word for word in tweet if word not in stopwords]
                  for tweet in tokenizedTweets]
print("Filtered Tweets: ", filteredTweets)

#Stemming
st = SnowballStemmer("english")
stemmedTweets = [[st.stem(word) for word in tweet if word not in stopwords]
                 for tweet in tokenizedTweets]
print("Stemmed Tweets: ", stemmedTweets)

flatstemmedTweets = [" ".join(tweet) for tweet in stemmedTweets]
print("Stemmed Tweets flattened: ", flatstemmedTweets)

#PART 2: Noise Removal

#TF-IDF representation
vectorizer = TfidfVectorizer(min_df=1)
tweetsTF = vectorizer.fit_transform(flatstemmedTweets)

#DBScan
#clustering function using DBSCAN from scikit-learn
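
#A minimal sketch, not part of the original snippet: one way the DBSCAN step
#could look, assuming scikit-learn's DBSCAN and the tweetsTF matrix built above;
#eps and min_samples are illustrative values, not tuned ones.
from sklearn.cluster import DBSCAN

def clusterTweets(tfidfMatrix, eps=0.7, min_samples=3):
    #cosine distance suits TF-IDF vectors; fit_predict labels noise points as -1
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
    return db.fit_predict(tfidfMatrix)

clusterLabels = clusterTweets(tweetsTF)
print("Cluster labels: ", clusterLabels)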
Ejemplo n.º 48
0
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
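# Illustrative behaviour: the token is lemmatized as a verb first, then stemmed,
# e.g. lemmatize_stemming('went') -> 'go' and lemmatize_stemming('running') -> 'run'.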
Ejemplo n.º 49
0
class TextSlack(BaseEstimator, TransformerMixin):
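    # sklearn-compatible transformer: cleans, tokenises, removes stopwords and
    # lemmatizes text, with POS-based extraction and sentiment helpers below.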
    def __init__(self, variety='BrE', user_abbrevs={}, lang='english'):
        try:
            self.variety = variety
            self.user_abbrevs = user_abbrevs
            self.lang = lang
            if self.lang in stopwords.fileids(
            ) and self.lang in SnowballStemmer.languages:
                self.stop_words = stopwords.words(lang)
            else:
                raise LanguageNotFoundException(
                    '{} is currently not supported by textslack.'.format(
                        self.lang),
                    'Keep checking for support in the future updates.')
            self.lemmatizer = WordNetLemmatizer()
            self.stemmer = SnowballStemmer(lang, ignore_stopwords=True)

        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        if isinstance(X, pd.Series):
            return X.apply(self._preprocess_text)
        elif isinstance(X, list):
            return [self._preprocess_text(x) for x in X]
        else:
            return self._preprocess_text(X)

    def _preprocess_text(self, text):
        if self.lang == 'english':
            normalised_text = self._normalise(text)
            normalised_text = re.sub(' +', ' ', normalised_text)
            words = regexp_tokenize(normalised_text.lower(), r'[A-Za-z]+')
            removed_punct = self._remove_punct(words)
            removed_stopwords = self._remove_stopwords(removed_punct)
            return self._lemmatize(removed_stopwords)
        else:
            words = word_tokenize(text.lower())
            removed_punct = self._remove_punct(words)
            removed_stopwords = self._remove_stopwords(removed_punct)
            return ' '.join([w for w in removed_stopwords])

    def _normalise(self, text):
        try:
            return ' '.join(
                normalise(word_tokenize(text),
                          variety=self.variety,
                          user_abbrevs=self.user_abbrevs,
                          verbose=False))
        except Exception:
            return text

    def _remove_punct(self, words):
        return [w for w in words if w not in string.punctuation]

    def _remove_stopwords(self, words):
        return [w for w in words if w not in self.stop_words and len(w) > 1]

    def _lemmatize(self, words):
        return ' '.join([self.lemmatizer.lemmatize(w, pos='v') for w in words])

    def _stem(self, words):
        return ' '.join([self.stemmer.stem(w) for w in words])

    def extract_nouns(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                pos_tags, _ = self._blob_features(processed_text)
                return ' '.join([w for w, p in pos_tags if p == 'NN'])
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'
                    .format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def extract_verbs(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                pos_tags, _ = self._blob_features(processed_text)
                return ' '.join([w for w, p in pos_tags if p == 'VB'])
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'
                    .format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def extract_adjectives(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                pos_tags, _ = self._blob_features(processed_text)
                return ' '.join([w for w, p in pos_tags if p == 'JJ'])
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'
                    .format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def extract_adverbs(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                pos_tags, _ = self._blob_features(processed_text)
                return ' '.join([w for w, p in pos_tags if p == 'RB'])
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'
                    .format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def sentiment(self, text):
        try:
            if self.lang == 'english':
                processed_text = self._preprocess_text(text)
                _, polarity = self._blob_features(processed_text)
                return 'pos' if polarity > 0.0 else 'neg' if polarity < 0.0 else 'neu'
            else:
                raise LanguageNotFoundException(
                    'Sorry for the inconvenience, textslack is still learning {}.'
                    .format(self.lang),
                    'Keep checking for support in the future updates.')
        except LanguageNotFoundException as e:
            print(str(e))
            print('Details: {}'.format(e.details))

    def _blob_features(self, text):
        blob = TextBlob(text)
        return blob.tags, blob.polarity

    def word_occurances(self, word, text):
        word_count_dic = Counter(word_tokenize(text))
        return word_count_dic.get(word, 0)
Ejemplo n.º 50
0
def stemmization(text, stemmer=SnowballStemmer('russian')):
    stem = [stemmer.stem(w) for w in remove_punctuation(text).split()]
    return ' '.join(stem)
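# remove_punctuation is an external helper assumed to be defined elsewhere in the original script.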
Ejemplo n.º 51
0
    def __init__(self):
        self.bl_tokenizer = LineTokenizer()
        self.re_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.stemmer = SnowballStemmer('english')
        self.NGRAM_RANGE = 3
Ejemplo n.º 52
0
import os
import re
import pandas as pd
import nltk
import argparse

import time

import torch
from torch.autograd import Variable
import torch.nn as nn
from models import NLINet
from mutils import get_optimizer
from data import get_nli, get_batch

from nltk.stem import SnowballStemmer
snowball_stemmer1 = SnowballStemmer('spanish')
snowball_stemmer2 = SnowballStemmer('english')
snowball_stemmer1.stem
snowball_stemmer2.stem

# from nltk.corpus import stopwords
# stops1 = set(stopwords.words("spanish"))
# stops2 = set(stopwords.words("english"))



#################### READ DATA ####################

df_train_en_sp = pd.read_csv('./input/cikm_english_train_20180516.txt', sep='\t', header=None, error_bad_lines=False)
df_train_sp_en = pd.read_csv('./input/cikm_spanish_train_20180516.txt', sep='\t', header=None, error_bad_lines=False)
df_train_en_sp.columns = ['english1', 'spanish1', 'english2', 'spanish2', 'result']
Ejemplo n.º 53
0
plt.figure(figsize=(20, 5))
lemmatized_words_freqdist.plot(len(lemmatized_words_freqdist.most_common(50)))
plt.show()

# Convert Lemmatized Words FreqDist Object into a Dictionary

lemmatized_words_freq_dict = dict(lemmatized_words_freqdist)
print(len(lemmatized_words_freq_dict))
print(lemmatized_words_freq_dict)

#Further Cleaning Using NLTK- Stem (From Word List after removing Stopwords)

# Snowball Stemmer
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
words_stemmed = [stemmer.stem(word) for word in words_no_stopwords]
print(len(words_stemmed))
print(words_stemmed[:100])

#Calculate the Frequency of Words

stemmed_words_freqdist = nltk.FreqDist(words_stemmed)
print(len(stemmed_words_freqdist))
print(stemmed_words_freqdist.items())

print(stemmed_words_freqdist.most_common(50))

import matplotlib.pyplot as plt
plt.figure(figsize=(20, 5))
stemmed_words_freqdist.plot(len(stemmed_words_freqdist.most_common(50)))
Ejemplo n.º 54
0
tokens = convertStemms(tokensStop)
print(text[:100])

vocabulary = getVocabulary(text)

positions = initializeContext(tokens, vocabulary) #Initialize Context

#Get contexts
contexts = {}
print("Context:")
for term in vocabulary:
	contexts[term] = getContext(term, positions, 4, tokens)
	
#Get frequency vectors
vectors = getFrecuency(vocabulary, contexts)

word = "grande"
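#Stem the query word so it can be looked up in the stemmed vocabulary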
ss = SnowballStemmer("spanish")
stemWord = ss.stem(word)

similitud = {}
similitud = getSimilitud(vocabulary, vectors, stemWord)

l = list()
for key, val in similitud.items():
	l.append((val, key))
l.sort(reverse = True)
print(l[:10])

createFileDic(nameFile, l)
Ejemplo n.º 55
0
import nltk

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

with open('trial2.txt', 'r') as myfile:
    data2=myfile.read().replace('\n', ' ')

sp=set(stopwords.words("english"))
variable = nltk.word_tokenize(data2)
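# stem both the stopword list and the document tokens so the membership test compares stems with stems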
sp2 = [stemmer.stem(w) for w in sp]
variable2 = [stemmer.stem(w) for w in variable]
filtered_sentence = [w for w in variable2 if w not in sp2]
a=" ".join(filtered_sentence)
sent = nltk.sent_tokenize(a)
b="\n".join(sent)
with open('naya.txt', 'w') as out_f:
    out_f.write(b)
Ejemplo n.º 56
0
def convertStemms(tokens):
	ss = SnowballStemmer("spanish")
	text = []
	for t in tokens:
		text.append(ss.stem(t))
	return text
Ejemplo n.º 57
0
from gensim import corpora, models, similarities
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.parsing import preprocessing
from gensim.utils import simple_preprocess
import contractions
import os
from tqdm import tqdm
from pprint import pprint
import pickle
import spacy
#from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer(language='english')

# disable parse, named entity recognition to speed it up.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Add new stop words: https://stackoverflow.com/questions/41170726/add-remove-stop-words-with-spacy
# |= : in-place set union, adding these custom stop words to spaCy's defaults
nlp.Defaults.stop_words |= {
    'table', 'ref', 'formula', 'citation', 'cit', 'references',
    'fig', 'figure', 'abstract', 'introduction', 'description', 'conclusion',
    'results', 'discussion'
}

# Load the Mallet LDA Java  program
mallet_path = '/home/ashwath/mallet-2.0.8/bin/mallet'
'''
Ejemplo n.º 58
0
nltk.download('wordnet')

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

print(len(documents))
print(documents[:5])

np.random.seed(2018)

print(WordNetLemmatizer().lemmatize('went', pos='v'))

### Performing stem operation
stemmer = SnowballStemmer('english')
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization',
    'sensational', 'traditional', 'reference', 'colonizer', 'plotted'
]
singles = [stemmer.stem(plural) for plural in original_words]
chk = pd.DataFrame(data={'original word': original_words, 'stemmed': singles})
print(chk)


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def preprocess(text):
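    # A plausible completion, not shown in the original snippet: the usual
    # gensim-style body, assuming `import gensim` appears earlier in the script.
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result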
Ejemplo n.º 59
0
            word_counts[word] = 1
        else:
            word_counts[word] += 1

print("Size of Vocabulary:", len(word_counts))

# Create a list of tuples of unique words and their corresponding frequencies
word_counts_list = [(key, value) for key, value in word_counts.items()]

# Sort the list of word_counts in descending order
word_counts_list = sorted(word_counts_list, key=lambda x: x[1], reverse=True)
# Print the top 100 words
print(word_counts_list[:100])

# Define a snowball stemmer object for English
stemmer = SnowballStemmer("english")

# Load GloVe's embeddings
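# (keys are stored in stemmed form, so later lookups must stem the query word too)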
embeddings_index = {}
with open(directory + folder + "glove.840B.300d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = stemmer.stem(values[0])
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

import numpy as np
import json
Ejemplo n.º 60
0
        printall += "Fichero " + docIndex.get(d) + "\n"
        # Obtain article and print title
        article = re.split(delimiter_noticia, data)[p + 1]
        printall += re.split(delimiter_title, article)[1] + "\n"
        cont += 1
        if len(res) <= 2:
            # Print whole article
            text = re.split(delimiter_text, article)[1]
            printall += text + "\n"
        elif len(res) <= 5:
            # Print snippets
            toprint = snippet(re.split(delimiter_text, article)[1], wordlist)
            printall += toprint + "\n"

    # Print number of results and timing
    total_time = time.time() - start_time
    printall += "%d resultados obtenidos en %.9f segundos\n" % (len(res),
                                                                total_time)
    return printall


print("Loading mini_enero.data...")
# Retrieve data from file
with open("mini_enero.data", "rb") as f:
    (index, docIndex, titleIndex, catIndex, dateIndex, universe, stems,
     permuterm) = pickle.load(f)
print("Loaded!")
# Prepare variables
stemmer = SnowballStemmer('spanish')