def pos_tokenizer(s): #define a tokenizer that uses POS tagging
    texts=nltk.word_tokenize(s)

    texts=[word for word in texts if len(word)>2]

    # PULL OUT NOUN AND VERB PHRASES
    chunktext=nltk.pos_tag(texts)
    patterns="""
                VP:{<V.*><DT>?<JJ.*>?<NN.*>}
                NP:{<DT>?<JJ>*<NN.*>}
                N:{<NN.*>}
    """
    NPchunker=nltk.RegexpParser(patterns)

    from nltk.stem.snowball import SnowballStemmer
    st=SnowballStemmer('english')

    #print text
    temp=[]
    result=NPchunker.parse(chunktext)
    #print result
    for phrase in result:
        try:
            phrase.label()
            string=''
            m=0
            for word in phrase:
                if m==0:
                    string+=st.stem(word[0])
                    m+=1
                else: string+=' '+st.stem(word[0])
            temp.append(string)
        except: pass
    return temp
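A minimal usage sketch for pos_tokenizer, assuming NLTK is installed and the 'punkt' and 'averaged_perceptron_tagger' resources have been downloaded; the sample sentence is illustrative.

import nltk

nltk.download('punkt')                        # tokenizer model (skip if already present)
nltk.download('averaged_perceptron_tagger')   # POS tagger model

phrases = pos_tokenizer("The quick brown fox jumped over the lazy dog")
print(phrases)   # stemmed noun/verb phrases extracted from the sentence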
Example n. 2
    def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu'
Example n. 3
def main():
    parser = argparse.ArgumentParser(description='Evaluate translation hypotheses.')
    parser.add_argument('-i', '--input', default=baseline_path+'data/hyp1-hyp2-ref',
            help='input file (default data/hyp1-hyp2-ref)')
    parser.add_argument('-n', '--num_sentences', default=None, type=int,
            help='Number of hypothesis pairs to evaluate')
    # note that if x == [1, 2, 3], then x[:None] == x[:] == x (copy); no need for sys.maxint
    opts = parser.parse_args()

    # we create a generator and avoid loading all sentences into a list
    def sentences():
        with open(opts.input) as f:
            for pair in f:
                yield [sentence.strip().split() for sentence in pair.split(' ||| ')]

    english_stemmer = SnowballStemmer("english")

    # note: the -n option does not work in the original code
    for h1, h2, ref in islice(sentences(), opts.num_sentences):
        # Perform morphological stemming before calculating METEOR score
        h1 = [english_stemmer.stem(word) for word in h1]
        h2 = [english_stemmer.stem(word) for word in h2]
        ref = [english_stemmer.stem(word) for word in ref]

        rset = set(ref)
        h1_match = meteor(h1, rset)
        # print "meteor is h1_match ", h1_match
        h2_match = meteor(h2, rset)
        # print "meteor is h2_match ", h2_match
        print(1 if h1_match > h2_match else # \begin{cases}
                (0 if h1_match == h2_match
                    else -1)) # \end{cases}
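The x[:None] remark above carries over to islice: a stop value of None makes islice yield the whole generator, so leaving -n unset evaluates every sentence pair. A tiny check:

from itertools import islice

assert list(islice(range(3), None)) == [0, 1, 2]   # stop=None means "take everything"
assert list(islice(range(3), 2)) == [0, 1]         # an explicit -n limit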
Example n. 4
 def classify(self, sText):
    """Given a target string sText, this function returns the most likely document
    class to which the target string belongs (i.e., positive, negative or neutral).
    """
    tokens = self.tokenize(sText)
    posProbability, negProbability = 0, 0
    posNum, negNum = float(sum(self.pos_dic.values())), float(sum(self.neg_dic.values()))
    stemmer = SnowballStemmer("english")
    for i in range(len(tokens) - 1):
        if not isPunctuationMark(tokens[i]):
            unigram = stemmer.stem(tokens[i])
            second_word = stemmer.stem(tokens[i + 1])
            try:
                bigram = unigram + " " + second_word
            except UnicodeDecodeError:
                continue
            # add-one smoothing; take the log to avoid underflow
            posProbability += math.log(float((self.pos_dic.get(bigram, 0) + 1)) / posNum)
            posProbability += math.log(float((self.pos_dic.get(unigram, 0) + 1)) / posNum)
            negProbability += math.log(float((self.neg_dic.get(bigram, 0) + 1)) / negNum)
            negProbability += math.log(float((self.neg_dic.get(unigram, 0) + 1)) / negNum)
    if tokens:
        posProbability += math.log(float((self.pos_dic.get(tokens[-1], 0) + 1)) / posNum)
        negProbability += math.log(float((self.neg_dic.get(tokens[-1], 0) + 1)) / negNum)
    if posProbability > negProbability:
        return "positive"
    else:
        return "negative"
Example n. 5
class WordCount:
	def __init__(self, language):
		self.stopwords = self.load_stopwords(language)
		self.parse_regexp = re.compile(r"([0-9]*[\w][\w0-9]+)", re.UNICODE)
		self.current_stemmer = SnowballStemmer(language)

	@staticmethod
	def load_stopwords(language):
		stoplist = []
		if language == 'english':
			with codecs.open('geomedia'+ os.sep +'en_stoplist.txt', "r", "utf-8") as f:
				stoplist = [line.rstrip() for line in f]
		else:
			#download('stopwords')
			stoplist = stopwords.words(language)

		return stoplist

	def parse_text(self, text, wordcount_dictionary=None):
		"""
		>>> wordcount = WordCount() #doctest: +ELLIPSIS
		[nltk_data] ...
		>>> wordcount.parse_text("a1a ma kota")
		{'ma': 1, 'a1a': 1, 'kota': 1}
		>>> wordcount.parse_text("a1a ma kota", {'a1a': 2, 'kota': 1})
		{'ma': 1, 'a1a': 3, 'kota': 2}
		"""
		if wordcount_dictionary is None:
			wordcount_dictionary = {}
		words = self.parse_regexp.findall(text)
		for word in words:
			new_word = self.current_stemmer.stem(word.lower())
			if word not in self.stopwords and new_word not in self.stopwords:
				if new_word in wordcount_dictionary:
					wordcount_dictionary[new_word] += 1
				else:
					wordcount_dictionary[new_word] = 1
		return wordcount_dictionary
		
	def parse_text_extra(self, text, wordcount_dictionary=None, extras=None):
		if wordcount_dictionary is None:
			wordcount_dictionary = {}
		if extras is None:
			extras = {}
		words = self.parse_regexp.findall(text)
		for word in words:
			new_word = self.current_stemmer.stem(word.lower())
			word = word.lower()
			if word not in self.stopwords and new_word not in self.stopwords:
				if new_word in wordcount_dictionary:
					wordcount_dictionary[new_word] += 1
					if word in extras[new_word]:
						extras[new_word][word] += 1
					else:
						extras[new_word][word] = 1
				else:
					wordcount_dictionary[new_word] = 1
					extras[new_word] = {}
					extras[new_word][word] = 1
		# return both dictionaries so results are not lost when they were created locally
		return wordcount_dictionary, extras
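A minimal usage sketch for WordCount, assuming the NLTK stopwords corpus is available and the class's imports are in scope; a non-English language is used so the geomedia/en_stoplist.txt file is not needed.

import nltk
nltk.download('stopwords')   # assumption: the corpus is not yet present

wc = WordCount('italian')
counts = wc.parse_text("il gatto dorme e il gatto mangia")
print(counts)   # frequencies of the stemmed, non-stopword tokens, e.g. {'gatt': 2, ...}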
Example n. 6
def stem_snowball(tokens):
    stemmer = SnowballStemmer("russian")

    if isinstance(tokens, basestring):
        return stemmer.stem(tokens)
    else:
        stemmed = [stemmer.stem(token) for token in tokens]
        return stemmed
Example n. 7
def stem(list):
    stemmer = SnowballStemmer('english')
    stemmed_tokens = []

    for x in list:
        stemmed_tokens.append(stemmer.stem(x))
        terms_dictionary.update_terms_dictionary(stemmer.stem(x), x)  # build the dictionary mapping stems to their original terms
    return stemmed_tokens
Example n. 8
    def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen'
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount,100])

    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")
            
    tempVector = dict()
        
    #Create your bigrams
    bgs = nltk.bigrams(bagOfWords)

    fdist = nltk.FreqDist(bgs)   
    
    for k in fdist.keys()[:100]:
        tempVector[k] = 0
    
    
    theKeys = tempVector.keys()
    
    for art in articleList.items():        
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove Stops
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if tempVector.has_key(w)):
                keyInd = theKeys.index(word)      
                featureMatrix[index][keyInd] += 1
                           
            index += 1
            if index % 100 == 0:
                print "extracted", index, "features"
        
            if index >= commentCount:
                break            
            
            
    
    
    print "non-zero",np.count_nonzero(featureMatrix)
    print "Percentage filled:%.2f" %(float(np.count_nonzero(featureMatrix))/(featureMatrix.shape[0]*featureMatrix.shape[1]))
    return featureMatrix
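A compact Python 3 sketch of the bigram-counting core of extract_bigrams on a toy token list; FreqDist.most_common replaces the Python 2 fdist.keys() slicing used above:

import nltk
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")
tokens = "the cats were chasing the other cats around the garden".split()   # toy input
stems = [stemmer.stem(t) for t in tokens]

fdist = nltk.FreqDist(nltk.bigrams(stems))
print(fdist.most_common(5))   # the five most frequent stemmed bigrams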
Example n. 10
def get_unigram_feats(document):
	document_words = set(document.split())
	s = SnowballStemmer("english")
	stemmed_words = [ s.stem(word) for word in document_words ]
	features = {}
	#features['count'] = len(document_words)
	for word in data.wordlist:
		word = s.stem(word)
		features['contains({})'.format(word)] = (word in stemmed_words)
	return features
Example n. 11
def highestFrequency(quesWords,sentWords):
    stemmer = SnowballStemmer("english");
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw) :
                match += 1
            else:
                nonMatch += 1
    return (match)
Example n. 12
def jaccardDistance(quesWords,sentWords):
    stemmer = SnowballStemmer("english");
    match = 0
    nonMatch = 0
    for qw in quesWords:
        for aw in sentWords:
            if stemmer.stem(qw) == stemmer.stem(aw) :
                match += 1
            else:
                nonMatch += 1
    return (match)
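Despite its name, jaccardDistance returns the same raw match count as highestFrequency above; a set-based Jaccard similarity over stems would look roughly like this sketch:

from nltk.stem.snowball import SnowballStemmer

def jaccard_similarity(quesWords, sentWords):
    stemmer = SnowballStemmer("english")
    q = set(stemmer.stem(w) for w in quesWords)
    s = set(stemmer.stem(w) for w in sentWords)
    if not (q | s):
        return 0.0
    return len(q & s) / float(len(q | s))   # |intersection| / |union|

print(jaccard_similarity(["running", "dogs"], ["dog", "runs", "fast"]))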
def preProcessing(bitext):
    # transfer to lower case
    bitext = [[[x.lower() for x in sent ] for sent in bisent] for bisent in bitext]
    # stemmer
    e_stemmer = SnowballStemmer("german")
    f_stemmer = SnowballStemmer("english")
    for (n, (f,e)) in enumerate(bitext):
        for idx, f_i in enumerate(f):
            f[idx] = f_stemmer.stem(f_i)
        for idx, e_i in enumerate(e):
            e[idx] = e_stemmer.stem(e_i)
Example n. 14
def process_missing(missing, sec):
    st = SnowballStemmer('english')
    morphological_errors = 0
    for m in missing:
        ind = sec['incorrect'].index(m)
        prediction = sec['predicted'][ind]
        if(st.stem(m[3]) == st.stem(prediction[0])):
            morphological_errors += 1        
        print('the correct sequence is: '+str(m)+' but predicted: '+str(prediction))
    print('morphological errors:' + str(morphological_errors))
    if len(missing):
        print('percentage:' + str(morphological_errors/len(missing)))
Example n. 15
 def trigram(self,term):
   x,y,z =term
   stemmer=SnowballStemmer("english")
   x= stemmer.stem(x)
   y= stemmer.stem(y)
   z= stemmer.stem(z)
   label=x+y+z 
   new_column=[]
   for words_stem in self.stemwords:       
     if x in words_stem and y in words_stem and z in words_stem:
         new_column.append('True')
     else:
         new_column.append('False')
   self.dataframegenerator(new_column,label) 
Example n. 16
def imprimir_resto(clase, puesto, descrip, req):
    #lineaTotal = filter(lambda x: x in string.printable, lineaTotal)
    archEscritura.write(clase)
    archEscritura.write(",")
    blobPuesto = TextBlob(puesto.decode('utf-8'))
    blobDescrip = TextBlob(descrip.decode('utf-8'))
    blobReq = TextBlob(req.decode('utf8', 'ignore'))
    wordsPuesto = blobPuesto.words
    wordsDescrip = blobDescrip.words
    wordsReq = blobReq.words
    
    for wordP in wordsPuesto:
        nword = strip_accents(wordP)
        exclude = set(string.punctuation)
        nword = ''.join(ch for ch in nword if ch not in exclude)
        nword = nword.lower()
        nword = filter(lambda x: x in string.printable, nword)
        archEscritura.write(nword)
        archEscritura.write(" ")
    archEscritura.write(",")
    stemmer = SnowballStemmer("spanish")
    cad = ""
    for wordD in wordsDescrip:
        nwordD = strip_accents(wordD)
        exclude = set(string.punctuation)
        nwordD = ''.join(ch for ch in nwordD if ch not in exclude)
        nwordD = filter(lambda x: x in string.printable, nwordD)
        if nwordD not in (stopwords.words('spanish')):  # remove stop words
            w=Word(nwordD)        
        #comentarios.append(w)
            word2= stemmer.stem(w.lower())
            archEscritura.write(word2)
            archEscritura.write(" ")
    archEscritura.write(",")
    lista = []
    for wordP in wordsReq:
        nwordP = strip_accents(wordP)
        exclude = set(string.punctuation)
        nwordP = ''.join(ch for ch in nwordP if ch not in exclude)
        nwordP = filter(lambda x:x in string.printable, nwordP)
        if nwordP not in (stopwords.words('spanish')):
            w=Word(nwordP)
            word3 = stemmer.stem(w.lower())
            if word3 not in lista:
                lista.append(word3)
    for pal in lista:
        archEscritura.write(pal)
        archEscritura.write(" ")
    archEscritura.write("\n")
def clean_single_word(word, lemmatizing="wordnet"):
    """
    Performs stemming or lemmatizing on a single word.

    If we are to search for a word in a clean bag-of-words, we need to search it after the same kind of preprocessing.

    Inputs: - word: A string containing the source word.
            - lemmatizing: A string containing one of the following: "porter", "snowball" or "wordnet".

    Output: - lemma: The resulting clean lemma or stem.
    """
    if lemmatizing == "porter":
        porter = PorterStemmer()
        lemma = porter.stem(word)
    elif lemmatizing == "snowball":
        snowball = SnowballStemmer('english')
        lemma = snowball.stem(word)
    elif lemmatizing == "wordnet":
        wordnet = WordNetLemmatizer()
        lemma = wordnet.lemmatize(word)
    else:
        print("Invalid lemmatizer argument.")
        raise RuntimeError

    return lemma
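A usage sketch for clean_single_word, assuming PorterStemmer, SnowballStemmer and WordNetLemmatizer are imported from nltk.stem as above and the WordNet data has been downloaded:

for method in ("porter", "snowball", "wordnet"):
    print(method, clean_single_word("studies", lemmatizing=method))
# the stemmers return a stem such as 'studi'; wordnet lemmatizes to 'study'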
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated) 
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """


    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        stemmer = SnowballStemmer("english")
        stemmed_words = []
        for word in text_string.split():
            stemmed_words.append(stemmer.stem(word.strip()))
        words = " ".join(stemmed_words)

    return words
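string.maketrans was removed in Python 3; a sketch of the equivalent punctuation-stripping and stemming step with str.maketrans:

import string
from nltk.stem.snowball import SnowballStemmer

text_string = "Hi! Please, forward the attached report -- thanks."
text_string = text_string.translate(str.maketrans("", "", string.punctuation))

stemmer = SnowballStemmer("english")
print(" ".join(stemmer.stem(w) for w in text_string.split()))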
class snowballStemmer:
	def __init__(self):
		self.stemmer = SnowballStemmer("english")

	def stem(self,keyword_score, keyword_idf_dir, keyword_tf_dir):
		stem_dict = {}
		stem_dict_score = {}
		print "in stem function.............."
		for key in keyword_score.iterkeys():
			root = self.stemmer.stem(key.keyword)
			if stem_dict.has_key(root):
				stem_dict[root]['words'].append(key)
				if key.is_title() or key.is_tag():
					stem_dict[root]['boost'] = 1
				if stem_dict[root]['tf'] < keyword_tf_dir[key]:
					stem_dict[root]['tf'] = keyword_tf_dir[key]
			else:
				stem_dict[root] = {}
				stem_dict[root]['boost'] = 0
				stem_dict[root]['words'] = []
				stem_dict[root]['words'].append(key)
				stem_dict[root]['idf'] = keyword_idf_dir[key]
				stem_dict[root]['tf'] = keyword_tf_dir[key]
				if key.is_title() or key.is_tag():
					stem_dict[root]['boost'] = 1

		for root in stem_dict.iterkeys():
			stem_dict_score[root] = stem_dict[root]['idf']*stem_dict[root]['tf']
	#	print stem_dict_score
		return stem_dict, stem_dict_score
Example n. 20
    def tokenize(self, document):
        """
        Break text into sentences and each sentence into a list of single words
        Ignore any token that falls into the stopwords set.
        """
        # use sentence tokenizer sent_tokenize from nltk package
        sentences = sent_tokenize(utils.to_unicode(document.lower()))

        # create stemmer of class SnowballStemmer
        stemmer = SnowballStemmer("english")

        for sentence in sentences:
            words = [word
                   for word in utils.tokenize(
                    self.cleanse_text(sentence)
                   )]

            if self.remove_stopwords:
                words = [ 
                         word for word in words 
                         if word not in self.en_stopwords
                        ]

            if self.stemming:
                words = [stemmer.stem(t) for t in words]

            yield words
Example n. 21
	def get_stemm_tags(self, tags):
		stemm_tags = []
		current_stemmer = SnowballStemmer('english')
		for tag in self.tags:
			stemm_tags.append(current_stemmer.stem(tag.lower()))
		
		return stemm_tags
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated)
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        """


    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""

    stemmer = SnowballStemmer("english")
    if len(content) > 1:
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
        
        split = text_string.split()  
        text = [stemmer.stem(word) for word in split]
        words = ' '.join(text)


    f.close()

    return words.strip()
Example n. 23
class Preprocessor(object):

    # initialise the various text-processing components
    def __init__(self):
        # create a regular-expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # load the English stopword list
        self.stop_words_english = stopwords.words('english')

        # create the Snowball stemmer
        self.stemmer = SnowballStemmer('english')

    # tokenize, remove stopwords, and stem
    def process(self, input_text):
        # tokenize
        tokens = self.tokenizer.tokenize(input_text.lower())

        # remove stopwords
        tokens_stopwords = [x for x in tokens if not x in self.stop_words_english]

        # stem
        tokens_stemmed = [self.stemmer.stem(x) for x in tokens_stopwords]

        # return the processed tokens
        return tokens_stemmed
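A usage sketch for Preprocessor, assuming RegexpTokenizer, stopwords and SnowballStemmer are imported as in the class above and the nltk stopwords corpus is available:

preprocessor = Preprocessor()
tokens = preprocessor.process("The runners were quickly running past the old stadium")
print(tokens)   # stemmed tokens with stopwords removed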
Example n. 24
def text_cleaner_and_tokenizer(texts):
    """
    takes a list of sentences, removes punctuation, numbers, stopwords and stems.
    Then joins everything back together and returns the filtered texts as a list of unicode strings
    :param texts: list of unprocessed strings
    :return: list of unicode strings
    """
    i = 0
    stopword_list = set(stopwords.words('danish'))
    stemmer = SnowballStemmer("danish", ignore_stopwords=False)
    filtered_texts = []

    for sentence in texts:
        for symbol in punctuation:
            sentence = sentence.replace(symbol,'')
        for num in numbers:
            sentence = sentence.replace(str(num),'')
        sentence = sentence.decode('utf-8').lower()
        words_in_sentence = word_tokenize(sentence, language='danish')
        filtered_sentence = []
        for word in words_in_sentence:
            if word not in stopword_list:
                stem_word = stemmer.stem(word)
                filtered_sentence.append(stem_word)

        sentence = ' '.join(filtered_sentence)
        filtered_texts.append(sentence)

        i = i +1
        if i % 1000 == 0:
            print(i)
    print('Done :D!')
    return filtered_texts
def tokenize(string, stem=True, entire=False):
    """
    INPUT: string
    OUTPUT: a list of words
    """
    string = string.replace("/", " ")
    string = string.replace("-", " ")
    tokenizer = PottsTokenizer(preserve_case=False)
    token_list = tokenizer.tokenize(string)
    punctuation = re.compile(r'[-.?!,":;$/*()|0-9]') # remove these punctuations and number 
    token_list = [punctuation.sub("", word) for word in token_list]  
    token_list = filter(None, token_list) #filters empty   

    #filter out stopwords 
    STOPWORDS = set(nltk.corpus.stopwords.words('english'))
    STOPWORDS.update(('would','does','got',"doesn't","it's","isn't","don't","i'm","i'll","i've", "=","can't","didn't","etc","+","%","won't","that's","nikon","g","&", "sure", "may", "yet", "ok","haven't","else","maybe","wouldn't","couldn't","via","rt","'","you're","almost","v","there's","#",'well','somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere'))
    if entire:
        # if need a larger set
        stopwords_entire_list = loadEntireStopWord()
        STOPWORDS.update(set(stopwords_entire_list))
    token_list = [word for word in token_list if word not in STOPWORDS]

    #stemmer 
    if stem:
        stemmer = SnowballStemmer("english")
        token_stem_list = [stemmer.stem(token) for token in token_list]
        token_list = token_stem_list

    return token_list
def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
    """
    :type s: str
    :type stem: bool
    :type use_re: bool
    :rtype: set(str)
    """
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()
    table = string.maketrans("","")

    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)

    if digit:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation + string.digits)))
    else:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation)))

    if stop:
        tokens = set(word for word in tokens if word not in stop_words)

    if stem:
        tokens = set(stemmer.stem(word) for word in tokens)

    return tokens
Example n. 27
 def __init__(self,df, column,n ): # gets the most frequent words in a document
   
     texto = " ".join(str(x) for x in df[column].values)
     tokens = texto.split()
     tokens=[x.lower() for x in tokens]
     #stopset = set(stopwords.words('english')) # dictionary of stop words
     #tokens = [w for w in tokens if not w in stopset]
     stemmer=SnowballStemmer("english")
     stemm_words=[]
     tokens_clean=[]
     for j in tokens:
       
       sa=re.sub('[^A-Za-z]+', '', j)
       tokens_clean.append(sa)
     #print tokens_clean
     for s in tokens_clean:
       try:
         stem= stemmer.stem(s)
         if s!='':
          stemm_words.append(str(stem)) 
       except:
         pass
     cuenta = len(tokens_clean)
     largo =  Counter(stemm_words).most_common(n)
     topdic = dict(largo)
     asortado = Series(topdic)
     asortadol = asortado.columns = ['a', 'b']
     ordenado = asortado.order(ascending=False)
     ordenadolist= topdic.keys() #+stemm_words
     self.top=ordenadolist
def parseOutBody(f):
    from nltk.stem.snowball import SnowballStemmer
    import string
  
    

    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation).split()

        ### project part 2: comment out the line below
        #words = text_string

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        
        stemmer = SnowballStemmer('english')
        
        for word in text_string:
            word = word.strip()
            word = stemmer.stem(word)            
            words = words + ' ' + word
    else:
        pass


    return words
Example n. 29
def stem_stopword_clean( vett_strings ):
    '''
    Takes a vector of strings (students or jobs) and returns each element of the list unique and stemmed.
    Splits elements made up of several words and removes the stopwords.
    :param vett_strings: vector of strings
    :return: vector of stemmed words without stopwords
    '''

    # import the libraries for stemming and stopwords
    from nltk.stem.snowball import SnowballStemmer
    from nltk.corpus import stopwords

    stemmer = SnowballStemmer("italian")

    stop = set(stopwords.words('italian'))

    # logger.error(stemmer.stem("italian"))
    # logger.error(stemmer.stem("a"))
    # logger.error(stemmer.stem("andate tutti a correre"))

    documents=[]

    # logger.error(stop)

    stem_parola=''

    for frasi in vett_strings:
        for parola in frasi.split(" "):
            stem_parola=stemmer.stem(parola)
            if(stem_parola not in stop and stem_parola not in documents):
                documents.append(stem_parola)


    return documents
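A usage sketch for stem_stopword_clean, assuming the Italian stopwords corpus has been downloaded via nltk.download('stopwords'); the input strings are illustrative:

frasi = ["corsa veloce nel parco", "il parco era molto grande"]
print(stem_stopword_clean(frasi))   # unique Italian stems with stopwords removed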
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top, stem words
        and return a string that contains all the words
        in the email (space-separated)
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """
    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### split the text string into individual words, stemming each word,
        ### and appending the stemmed word to words
        words = text_string.strip().split()
        stemmer = SnowballStemmer("english")
        stemmed_text_string = ""
 
        for word in words:
            stemmed_text_string += stemmer.stem(word) + " "

    return stemmed_text_string.strip()
Example n. 31
class OntologyMatchingDatasetReader(DatasetReader):
    """
    Reads instances from a jsonlines file where each line is in the following format:
    {"match": X, "source": {kb_entity}, "target: {kb_entity}}
     X in [0, 1]
     kb_entity is a slightly modified KBEntity in json with fields:
        canonical_name
        aliases
        definition
        other_contexts
        relationships
    and converts it into a ``Dataset`` suitable for ontology matching.
    Parameters
    ----------
    token_delimiter: ``str``, optional (default=``None``)
        The text that separates each WORD-TAG pair from the next pair. If ``None``
        then the line will just be split on whitespace.
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.  See :class:`TokenIndexer`.
        Note that the `output` tags will always correspond to single token IDs based on how they
        are pre-tokenised in the data file.
    """
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 name_token_indexers: Dict[str, TokenIndexer] = None,
                 token_only_indexer: Dict[str, TokenIndexer] = None) -> None:
        self._name_token_indexers = name_token_indexers or \
                                    {'tokens': SingleIdTokenIndexer(namespace="tokens"),
                                     'token_characters': TokenCharactersIndexer(namespace="token_characters")}
        self._token_only_indexer = token_only_indexer or \
                                   {'tokens': SingleIdTokenIndexer(namespace="tokens")}
        self._tokenizer = tokenizer or WordTokenizer()

        self._empty_token_text_field = TextField(
            self._tokenizer.tokenize('00000'), self._token_only_indexer)
        self._empty_list_token_text_field = ListField([
            TextField(self._tokenizer.tokenize('00000'),
                      self._token_only_indexer)
        ])

        self.PARENT_REL_LABELS = constants.UMLS_PARENT_REL_LABELS
        self.CHILD_REL_LABELS = constants.UMLS_CHILD_REL_LABELS

        self.STOP = set(stopwords.words('english'))
        self.tokenizer = RegexpTokenizer(r'[A-Za-z\d]+')
        self.stemmer = SnowballStemmer("english")
        self.lemmatizer = WordNetLemmatizer()

        self.nlp = spacy.load('en')

    @overrides
    def read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []

        # open data file and read lines
        with open(file_path, 'r') as ontm_file:
            logger.info(
                "Reading ontology matching instances from jsonl dataset at: %s",
                file_path)
            for line in tqdm.tqdm(ontm_file):
                training_pair = json.loads(line)
                s_ent = training_pair['source_ent']
                t_ent = training_pair['target_ent']
                label = training_pair['label']

                # convert entry to instance and append to instances
                instances.append(self.text_to_instance(s_ent, t_ent, label))

        if not instances:
            raise ConfigurationError(
                "No instances were read from the given filepath {}. "
                "Is the path correct?".format(file_path))
        return Dataset(instances)

    @staticmethod
    def _normalize_ent(ent):
        norm_ent = dict()
        norm_ent['canonical_name'] = string_utils.normalize_string(
            ent['canonical_name'])
        norm_ent['aliases'] = [
            string_utils.normalize_string(a) for a in ent['aliases']
        ]
        norm_ent['definition'] = string_utils.normalize_string(
            ent['definition'])
        norm_ent['par_relations'] = set(
            [string_utils.normalize_string(i) for i in ent['par_relations']])
        norm_ent['chd_relations'] = set(
            [string_utils.normalize_string(i) for i in ent['chd_relations']])
        return norm_ent

    def _compute_tokens(self, ent):
        """
        Compute tokens from given entity
        :param ent:
        :return:
        """
        name_tokens = string_utils.tokenize_string(ent['canonical_name'],
                                                   self.tokenizer, self.STOP)
        stemmed_tokens = tuple([self.stemmer.stem(w) for w in name_tokens])
        lemmatized_tokens = tuple(
            [self.lemmatizer.lemmatize(w) for w in name_tokens])
        character_tokens = tuple(
            string_utils.get_character_n_grams(ent['canonical_name'],
                                               constants.NGRAM_SIZE))

        alias_tokens = [
            string_utils.tokenize_string(a, self.tokenizer, self.STOP)
            for a in ent['aliases']
        ]

        def_tokens = string_utils.tokenize_string(ent['definition'],
                                                  self.tokenizer, self.STOP)

        return [
            name_tokens, stemmed_tokens, lemmatized_tokens, character_tokens,
            alias_tokens, def_tokens
        ]

    def _dependency_parse(self, name):
        """
        compute dependency parse of name and return root word, and all chunk root words
        :param name: name string
        :return:
        """
        doc = self.nlp(name)
        root_text = [(token.dep_, token.head.text) for token in doc]
        root = [t for d, t in root_text if d == 'ROOT'][0]
        root_words = set([t for d, t in root_text])
        return root, root_words

    def _get_features(self, s_ent, t_ent):
        """
        compute all LR model features
        :param s_ent:
        :param t_ent:
        :return:
        """
        s_name_tokens, s_stem_tokens, s_lemm_tokens, s_char_tokens, s_alias_tokens, s_def_tokens = self._compute_tokens(
            s_ent)
        t_name_tokens, t_stem_tokens, t_lemm_tokens, t_char_tokens, t_alias_tokens, t_def_tokens = self._compute_tokens(
            t_ent)

        has_same_canonical_name = (s_name_tokens == t_name_tokens)
        has_same_stemmed_name = (s_stem_tokens == t_stem_tokens)
        has_same_lemmatized_name = (s_lemm_tokens == t_lemm_tokens)
        has_same_char_tokens = (s_char_tokens == t_char_tokens)
        has_alias_in_common = (len(
            set(s_alias_tokens).intersection(set(t_alias_tokens))) > 0)

        # initialize similarity features
        name_token_jaccard_similarity = 1.0
        inverse_name_token_edit_distance = 1.0
        name_stem_jaccard_similarity = 1.0
        inverse_name_stem_edit_distance = 1.0
        name_lemm_jaccard_similarity = 1.0
        inverse_name_lemm_edit_distance = 1.0
        name_char_jaccard_similarity = 1.0
        inverse_name_char_edit_distance = 1.0

        # jaccard similarity and token edit distance
        max_changes = len(s_name_tokens) + len(t_name_tokens)
        max_char_changes = len(s_char_tokens) + len(t_char_tokens)

        if not has_same_canonical_name:
            name_token_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_name_tokens), set(t_name_tokens))
            inverse_name_token_edit_distance = 1.0 - edit_distance(
                s_name_tokens, t_name_tokens) / max_changes

        if not has_same_stemmed_name:
            name_stem_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_stem_tokens), set(t_stem_tokens))
            inverse_name_stem_edit_distance = 1.0 - edit_distance(
                s_stem_tokens, t_stem_tokens) / max_changes

        if not has_same_lemmatized_name:
            name_lemm_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_lemm_tokens), set(t_lemm_tokens))
            inverse_name_lemm_edit_distance = 1.0 - edit_distance(
                s_lemm_tokens, t_lemm_tokens) / max_changes

        if not has_same_char_tokens:
            name_char_jaccard_similarity = string_utils.get_jaccard_similarity(
                set(s_char_tokens), set(t_char_tokens))
            inverse_name_char_edit_distance = 1 - edit_distance(
                s_char_tokens, t_char_tokens) / max_char_changes

        max_alias_token_jaccard = 0.0
        min_alias_edit_distance = 1.0
        best_s_alias = s_ent['aliases'][0]
        best_t_alias = t_ent['aliases'][0]

        if not has_alias_in_common:
            for s_ind, s_a_tokens in enumerate(s_alias_tokens):
                for t_ind, t_a_tokens in enumerate(t_alias_tokens):
                    if s_a_tokens and t_a_tokens:
                        j_ind = string_utils.get_jaccard_similarity(
                            set(s_a_tokens), set(t_a_tokens))
                        if j_ind > max_alias_token_jaccard:
                            max_alias_token_jaccard = j_ind
                            best_s_alias = s_ent['aliases'][s_ind]
                            best_t_alias = t_ent['aliases'][t_ind]
                        e_dist = edit_distance(s_a_tokens, t_a_tokens) / (
                            len(s_a_tokens) + len(t_a_tokens))
                        if e_dist < min_alias_edit_distance:
                            min_alias_edit_distance = e_dist

        # has any relationships
        has_parents = (len(s_ent['par_relations']) > 0
                       and len(t_ent['par_relations']) > 0)
        has_children = (len(s_ent['chd_relations']) > 0
                        and len(t_ent['chd_relations']) > 0)

        percent_parents_in_common = 0.0
        percent_children_in_common = 0.0

        # any relationships in common
        if has_parents:
            max_parents_in_common = (len(s_ent['par_relations']) +
                                     len(t_ent['par_relations'])) / 2
            percent_parents_in_common = len(
                s_ent['par_relations'].intersection(
                    t_ent['par_relations'])) / max_parents_in_common

        if has_children:
            max_children_in_common = (len(s_ent['chd_relations']) +
                                      len(t_ent['chd_relations'])) / 2
            percent_children_in_common = len(
                s_ent['chd_relations'].intersection(
                    t_ent['chd_relations'])) / max_children_in_common

        # build acronyms as tuples of first letters so they are hashable and set-comparable
        s_acronyms = [tuple(i[0] for i in a) for a in s_alias_tokens]
        t_acronyms = [tuple(i[0] for i in a) for a in t_alias_tokens]
        has_same_acronym = (len(set(s_acronyms).intersection(set(t_acronyms)))
                            > 0)

        s_name_root, s_name_heads = self._dependency_parse(
            s_ent['canonical_name'])
        t_name_root, t_name_heads = self._dependency_parse(
            t_ent['canonical_name'])

        has_same_name_root_word = (s_name_root == t_name_root)
        has_same_name_chunk_heads = (s_name_heads == t_name_heads)
        name_chunk_heads_jaccard_similarity = string_utils.get_jaccard_similarity(
            s_name_heads, t_name_heads)

        s_alias_root, s_alias_heads = self._dependency_parse(best_s_alias)
        t_alias_root, t_alias_heads = self._dependency_parse(best_t_alias)

        has_same_alias_root_word = (s_alias_root == t_alias_root)
        has_same_alias_chunk_heads = (s_alias_heads == t_alias_heads)
        alias_chunk_heads_jaccard_similarity = string_utils.get_jaccard_similarity(
            s_alias_heads, t_alias_heads)

        def_jaccard_similarity = string_utils.get_jaccard_similarity(
            set(s_def_tokens), set(t_def_tokens))

        # form feature vector
        feature_vec = [
            FloatField(float(has_same_canonical_name)),
            FloatField(float(has_same_stemmed_name)),
            FloatField(float(has_same_lemmatized_name)),
            FloatField(float(has_same_char_tokens)),
            FloatField(float(has_alias_in_common)),
            FloatField(name_token_jaccard_similarity),
            FloatField(inverse_name_token_edit_distance),
            FloatField(name_stem_jaccard_similarity),
            FloatField(inverse_name_stem_edit_distance),
            FloatField(name_lemm_jaccard_similarity),
            FloatField(inverse_name_lemm_edit_distance),
            FloatField(name_char_jaccard_similarity),
            FloatField(inverse_name_char_edit_distance),
            FloatField(max_alias_token_jaccard),
            FloatField(1.0 - min_alias_edit_distance),
            FloatField(percent_parents_in_common),
            FloatField(percent_children_in_common),
            FloatField(float(has_same_acronym)),
            FloatField(float(has_same_name_root_word)),
            FloatField(float(has_same_name_chunk_heads)),
            FloatField(name_chunk_heads_jaccard_similarity),
            FloatField(float(has_same_alias_root_word)),
            FloatField(float(has_same_alias_chunk_heads)),
            FloatField(alias_chunk_heads_jaccard_similarity),
            FloatField(def_jaccard_similarity)
        ]

        return feature_vec

    @overrides
    def text_to_instance(
            self,  # type: ignore
            s_ent: dict,
            t_ent: dict,
            label: str = None) -> Instance:
        # pylint: disable=arguments-differ

        # sample n from list l, keeping only entries with len less than max_len
        # if n is greater than the length of l, just return l
        def sample_n(l, n, max_len):
            l = [i for i in l if len(i) <= max_len]
            if not l:
                return ['00000']
            if len(l) <= n:
                return l
            return random.sample(l, n)

        fields: Dict[str, Field] = {}

        fields['sparse_features'] = ListField(
            self._get_features(self._normalize_ent(s_ent),
                               self._normalize_ent(t_ent)))

        # tokenize names
        s_name_tokens = self._tokenizer.tokenize('00000 ' +
                                                 s_ent['canonical_name'])
        t_name_tokens = self._tokenizer.tokenize('00000 ' +
                                                 t_ent['canonical_name'])

        # add entity name fields
        fields['s_ent_name'] = TextField(s_name_tokens,
                                         self._name_token_indexers)
        fields['t_ent_name'] = TextField(t_name_tokens,
                                         self._name_token_indexers)

        s_aliases = sample_n(s_ent['aliases'], 16, 128)
        t_aliases = sample_n(t_ent['aliases'], 16, 128)

        # add entity alias fields
        fields['s_ent_aliases'] = ListField([
            TextField(self._tokenizer.tokenize('00000 ' + a),
                      self._name_token_indexers) for a in s_aliases
        ])
        fields['t_ent_aliases'] = ListField([
            TextField(self._tokenizer.tokenize('00000 ' + a),
                      self._name_token_indexers) for a in t_aliases
        ])

        # add entity definition fields
        fields['s_ent_def'] = TextField(
            self._tokenizer.tokenize(
                s_ent['definition']), self._token_only_indexer
        ) if s_ent['definition'] else self._empty_token_text_field
        fields['t_ent_def'] = TextField(
            self._tokenizer.tokenize(
                t_ent['definition']), self._token_only_indexer
        ) if t_ent['definition'] else self._empty_token_text_field

        # add entity context fields
        s_contexts = sample_n(s_ent['other_contexts'], 16, 256)
        t_contexts = sample_n(t_ent['other_contexts'], 16, 256)

        fields['s_ent_context'] = ListField([
            TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
            for c in s_contexts
        ])
        fields['t_ent_context'] = ListField([
            TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
            for c in t_contexts
        ])

        # add boolean label (0 = no match, 1 = match)
        fields['label'] = BooleanField(label)

        return Instance(fields)

    @classmethod
    def from_params(cls, params: Params) -> 'OntologyMatchingDatasetReader':
        tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
        name_token_indexers = TokenIndexer.dict_from_params(
            params.pop('name_token_indexers', {}))
        token_only_indexer = TokenIndexer.dict_from_params(
            params.pop('token_only_indexer', {}))
        params.assert_empty(cls.__name__)
        return OntologyMatchingDatasetReader(
            tokenizer=tokenizer,
            name_token_indexers=name_token_indexers,
            token_only_indexer=token_only_indexer)
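Note that read() expects the keys source_ent, target_ent and label on each line, which differs slightly from the field names in the class docstring; a sketch of one input line built from made-up entities:

import json

def toy_entity(name, aliases):
    # illustrative only; real KBEntity records carry definitions, contexts and relations
    return {"canonical_name": name, "aliases": aliases, "definition": "",
            "other_contexts": [], "par_relations": [], "chd_relations": []}

line = json.dumps({"source_ent": toy_entity("myocardial infarction", ["heart attack"]),
                   "target_ent": toy_entity("heart attack", ["MI"]),
                   "label": 1})
print(line)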
Example n. 32
documents = []
ignore_words = ['?', '\'']
# loop through each sentence in the training data
for pattern in training_data:
	# tokenize each word in the sentence
	w = nltk.word_tokenize(pattern['sentence'])
	# add to words list
	words.extend(w)
	# add to documents in corpus
	documents.append((w, pattern['class']))
	# add to classes list
	if pattern['class'] not in classes:
		classes.append(pattern['class'])

# stem and lowercase each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = list(set(words))

# remove duplicates
classes = list(set(classes))

print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "words", words)

# create training data
training = []
output = []
# create an empty array for output data
output_empty = [0] * len(classes)
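The snippet stops just before the vectors are filled in; a sketch of the usual next step, turning each tokenized document into a bag-of-words vector plus a one-hot class vector (SnowballStemmer stands in for the snippet's undefined stemmer):

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")   # assumption: the `stemmer` used above

for doc, cls in documents:
    # 1 if the stemmed vocabulary word occurs in the document, else 0
    pattern_words = [stemmer.stem(w.lower()) for w in doc]
    bag = [1 if w in pattern_words else 0 for w in words]

    output_row = list(output_empty)
    output_row[classes.index(cls)] = 1

    training.append(bag)
    output.append(output_row)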
Example n. 33
def matrix(self, domain, independent, domainb=[('without value', 'withoutvalue')]):
    # creates a matrix M from the paper on cross-domain sentiment classification
    stemmer = SnowballStemmer("english")

    ####################################################
    domaincheck = domain
    domainl = domain

    domain1, domain2 = map(list, zip(*domainb))
    domain1 = list(map(stemmer.stem, domain1))
    domain2 = list(map(stemmer.stem, domain2))

    matrixM = DataFrame(0, index=domainl, columns=independent)
    joinf = joindocuments(df1, df2)
    undersampleddf = joinf.join(self.df1, self.df2)
    for x in undersampleddf[self.column].values:

        tokens = x.split()
        tokens = [x.lower() for x in tokens]

        stemm_words = []
        tokens_clean = []
        for j in tokens:
            sa = re.sub('[^A-Za-z]+', '', j)
            tokens_clean.append(sa)

        for s in tokens_clean:
            try:
                stem = stemmer.stem(s)
                if s != '':
                    stemm_words.append(str(stem))
            except:
                pass

        inter = set(domain).intersection(stemm_words)        # intersection of the two lists
        intersection1 = list(inter)
        inter1 = set(independent).intersection(stemm_words)  # intersection of the two lists
        intersection2 = list(inter1)
        inter3 = set(domain1).intersection(stemm_words)      # intersection of the two lists
        intersection3 = list(inter3)
        inter4 = set(domain2).intersection(stemm_words)      # intersection of the two lists
        intersection4 = list(inter4)

        if intersection1:
            if intersection2:
                for x in intersection1:
                    for y in intersection2:
                        matrixM.xs(x)[y] = matrixM.xs(x)[y] + 1
        if intersection3:
            if intersection4:
                if intersection2:
                    for x1 in intersection3:
                        for y1 in intersection4:
                            for z1 in intersection2:
                                label = x1 + y1
                                if label in domain:
                                    matrixM.xs(label)[z1] = matrixM.xs(label)[z1] + 1

    return matrixM
Example n. 34
 def stem_tokens(self, tokens):
     from nltk.stem.snowball import SnowballStemmer
     stemmer = SnowballStemmer("english")
     return [stemmer.stem(t) for t in tokens]
Example n. 35
class Data(Sent):

	def __init__(self, directory_path = '', process_code = 0, outpath = ''):
		# Whether or not we want to take a subset of the dataframe
		self.subset = config_subset
		self.process_code = process_code
		self.outpath = outpath

		self.dataframe = pd.read_pickle(directory_path + 'all_the_news.pkl')

		# if self.subset:
		# 	self.dataframe = pd.read_pickle(directory_path + 'all_the_news.pkl').sample(frac=config_subsample_size)
		# 	self.labels = self.dataframe.index.tolist()
		# 	self.dataframe = self.dataframe.reset_index(drop=True)
		# else:
		# 	self.dataframe = pd.read_pickle(directory_path + 'all_the_news.pkl')
		# 	self.labels = self.dataframe.index.tolist()

		#self.nlp = en_core_web_sm.load()

		self.stopwords = stopwords.words('english')#{s : True for s in stopwords.words('english')}
		self.stemmer = SnowballStemmer('english')
		self.lmtzr = WordNetLemmatizer()

	# Will process all the text in spacy if we want to. Lots of different nlp options. Note: the rate is about 5000
	# articles per hour on 16gb RAM
	def __spacy__(self):

		if self.subset:
			self.dataframe = self.dataframe.sample(frac=config_subsample_size).reset_index(drop=True)

		start = time()
		self.spacy_text = {}
		for idx, row in self.dataframe.iterrows():
			self.spacy_text[idx] =  self.nlp(row['content'])

			if not idx % 5000:
				print(idx, "rows in", (time()-start)/60, 'min')

	def stem_vocab(self, w_lemma=True):
		start = time()

		if w_lemma:
			self.pdata = [[self.lmtzr.lemmatize(self.stemmer.stem(word)) for word in j] for j in self.pdata]
			print('Stemming and lemmatization done in', (time()-start) / 60, 'min')
		else:
			self.pdata = [[self.stemmer.stem(word) for word in j] for j in self.pdata]
			print('Stemming done in', (time()-start) / 60, 'min')

	# The primary function that builds the processed data
	# Once run, the data that can be output is self.pdata
	# Could also just return pdata later on, if that's a better design choice
	def get_processed_data(self, author_threshold = 10, load = False):
		if load:
			# Placeholder
			pass
		else:
			article_df = self.dataframe

			# Data cleaning
			article_df = article_df[~article_df.author.isna() & ~article_df.title.isna()]

			dct = dict(Counter(article_df.author.tolist()))
			filtered_users = [key for key in dct.keys() if dct[key]>author_threshold]
			article_df = article_df[article_df.publication.isin(filtered_users)]

			if self.subset:
				article_df = article_df.sample(frac=config_subsample_size)

			article_df = article_df.set_index('id', drop=True)
			self.labels = article_df.index.tolist()
			article_df = article_df.reset_index()
			self.metadata = article_df.reset_index()[['id', 'title', 'publication', 'author', 'date', 'year', 'month']].copy().reset_index()

			self.metadata.id = self.metadata.id.fillna(-1).astype('int64')
			self.metadata.month =  self.metadata.month.fillna(-1).astype('int64')
			self.metadata.year = self.metadata.year.fillna(-1).astype('int64')

			label_output = pd.DataFrame({'id' : self.labels}).reset_index()

			if config_write_labels:
				label_output.to_csv(self.outpath + 'label_mapping_' + str(self.process_code) + '.csv', index=False)
				self.metadata.to_csv(self.outpath + 'metadata_by_mapping_' + str(self.process_code) + '.csv', index=False)

			self.pdata = article_df.content.tolist()

			start = time()

			self.pdata = [[i for i in re.sub(r'[^\w\s]','',c.lower()).split() if i not in self.stopwords] for c in self.pdata]
			# Numerical processing from homework assignment 4
			self.pdata = [['NUM' if re.match('[0-9]+', word) is not None else word for word in c ] for c in self.pdata]
			print('Simple splitting done in:', (time()-start)/60, 'min')
def alcohol_abuse_classifier(file_name):
    tree=ET.parse(file_name)
    raw_text = tree.find('.//TEXT').text

    clean_text = re.sub('\\n', ' ', raw_text)
    clean_text = re.sub('\\t','', clean_text)
    clean_text = re.sub('[\s]{2,}', ' ', clean_text)

    sentences = nltk.sent_tokenize(clean_text)

    hotspot_lines = set()

    for sentence in sentences:
        # keep only alphabetical characters plus a few special characters
        tokenizer = RegexpTokenizer(r'[a-zA-Z\/\']+')
        token = tokenizer.tokenize(sentence)

        snowball_stemmer = SnowballStemmer("english")
        stemmed_tokens = [snowball_stemmer.stem(word.lower()) for word in token]

        drink_score = 0
        abuse_score = 0
        token_count = len(stemmed_tokens)

        for j in range(token_count):
            if stemmed_tokens[j] in stemmed_alcohol:
                drink_score += 1

                # Negation detection in left direction
                for i in range(1, left_negation+1):
                    if (j >= i) and (stemmed_tokens[j - i] in stemmed_negation):
                        drink_score = 0
                        break

                # Negation detection in right direction
                for i in range(1, right_negation+1):
                    if (j < token_count - i) and (stemmed_tokens[j + i] in stemmed_negation):
                        drink_score = 0
                        break

                # Modifier detection in left direction
                for i in range(1, left_modifier+1):
                    if (j >= i) and (stemmed_tokens[j - i] in stemmed_alcohol_modifer):
                        abuse_score += 1

                # Modifier detection in right direction
                for i in range(1, right_modifier+1):
                    if (j < token_count - i) and (stemmed_tokens[j + i] in stemmed_alcohol_modifer):
                        abuse_score += 1

            # Mental Health Detection
            elif stemmed_tokens[j] in stemmed_alcohol_mental:
                abuse_score += 1

        if drink_score >= 1 and abuse_score >= 1:
            hotspot_lines.add(sentence)

    if hotspot_lines:
        return '<ALCOHOL-ABUSE met="met" />'
    else:
        return '<ALCOHOL-ABUSE met="not met" />'
Example n. 37
 def test_russian(self):
     # Russian words both consisting of Cyrillic
     # and Roman letters can be stemmed.
     stemmer_russian = SnowballStemmer("russian")
     assert stemmer_russian.stem("авантненькая") == "авантненьк"
     assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k"
Example n. 38
 def test_short_strings_bug(self):
     stemmer = SnowballStemmer('english')
     assert stemmer.stem("y's") == 'y'
print(c.most_common())
'''
#COLLECTING TOP 10 WORDS IN TWEETS
tweets_lst = []
punc = string.punctuation

for item in data:
    tweets = item['text']
    for i in tweets.split():
        if '@' not in i and 'http' not in i:
            table = str.maketrans(
                {key: None
                 for key in string.punctuation}
            )  #https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
            text_nopunc = i.translate(table)
            text_stem = ss.stem(text_nopunc)
            #print(text_nopunc)
            tweets_lst.append(text_nopunc.lower())

stopwords = nltk.corpus.stopwords.words('english')

#Include unique twitter words in stopwords
#new_words = ['fortnite', 'one', 'follow', 'win', 'vbuck', 'retweet', 'enter', 'give', 'pick', 'play']
#for n in new_words:
#stopwords.append(n)

tweets_final = []

for i in tweets_lst:
    if i not in stopwords and len(i) > 2:
        tweets_final.append(i)
Example n. 40
class Importer(object):
    logging.basicConfig(format='%(asctime)s : %(levelname)s :: %(message)s', level=logging.DEBUG)

    def __init__(self, arg_document_count_limit=sys.maxint, arg_process_text_part=True, arg_process_html_part=False,
                 arg_process_both_empty=False):
        self.document_count_limit = arg_document_count_limit
        self.process_text_part = arg_process_text_part
        self.process_html_part = arg_process_html_part
        self.process_both_empty = arg_process_both_empty
        self.stemmer = SnowballStemmer("english")

        pass

    # http://brandonrose.org/clustering (with some modifications)
    @staticmethod
    def strip_proppers(arg_text):
        # first tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
        tokens = [word for sent in nltk.sent_tokenize(arg_text) for word in nltk.word_tokenize(sent)
                  if word.islower()]
        # todo get the startswiths and endswiths right here
        return "".join(
            [" " + i if not i.startswith("'") and not i.startswith("/") and not i.endswith(
                "=") and i not in string.punctuation else i for i in tokens]).strip()

    # http://brandonrose.org/clustering
    def tokenize_and_stem(self, arg_text):
        # first tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
        tokens = [current_word for sent in nltk.sent_tokenize(arg_text) for current_word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)
        stems = [self.stemmer.stem(token) for token in filtered_tokens]
        return stems

    def process_folder(self, arg_folder, arg_bulk_upload, arg_document_type, arg_buffer_limit, arg_server,
                       arg_index_name, arg_kmeans_dictionary):
        document_count = 0
        document_buffer = []
        indexed_count = 0
        error_count = 0
        for root, subdirectories, files in os.walk(arg_folder):
            for current in files:
                if document_count < self.document_count_limit:
                    current_full_file_name = os.path.join(root, current)
                    # logging.debug("%d %s", document_count, current_full_file_name)

                    current_json, document_id = self.get_json(current_full_file_name,
                                                              arg_process_text_part=self.process_text_part,
                                                              arg_process_html_part=self.process_html_part,
                                                              arg_process_both_empty=self.process_both_empty,
                                                              arg_kmeans_cluster_dictionary=arg_kmeans_dictionary)
                    # logging.debug(current_json)
                    document_count += 1
                    try:
                        if arg_bulk_upload:
                            wrapped = {'_type': arg_document_type, '_source': current_json}
                            document_buffer.append(wrapped)

                            if len(document_buffer) == arg_buffer_limit:
                                try:
                                    index_result = elasticsearch.helpers.bulk(arg_server, document_buffer,
                                                                              index=arg_index_name,
                                                                              request_timeout=1000)
                                    logging.debug(index_result)
                                    indexed_count += len(document_buffer)
                                    document_buffer = []
                                except elasticsearch.exceptions.ConnectionTimeout as connectionTimeout:
                                    logging.warn(connectionTimeout)
                                    document_buffer = []
                        else:
                            index_result = arg_server.index(index=arg_index_name, doc_type=arg_document_type,
                                                            body=current_json, id=document_id)
                            indexed_count += 1
                            logging.debug("id: %s, result: %s", document_id, index_result)
                    except elasticsearch.exceptions.SerializationError as serializationError:
                        logging.warn(serializationError)
                        error_count += 1
        # need to flush the pending buffer
        if arg_bulk_upload and len(document_buffer) > 0:
            index_result = elasticsearch.helpers.bulk(arg_server, document_buffer, index=arg_index_name)
            logging.debug(index_result)

    target_encoding = 'utf-8'

    # https://groups.google.com/forum/#!topic/microsoft.public.outlookexpress.general/oig7-xNFISg
    clean_address_tokens = ['=?us-ascii?Q?', '=0D=0A_=28', '=?utf-8?Q?', '=29?=', '=0D=0A']

    def clean_address(self, argvalue):
        result = str(argvalue)
        for token in self.clean_address_tokens:
            if token in result:
                result = result.replace(token, ' ')
        return result.lower().strip()

    @staticmethod
    def get_references(current_file):
        result = {}
        with open(current_file, 'rb') as fp:
            message = pyzmail.message_from_file(fp)
            if 'Message-Id' in message.keys():
                result['message-id'] = message['Message-Id']
            elif 'Message-ID' in message.keys():
                result['message-id'] = message['Message-ID']
            else:
                logging.warn('no message id in file %s', current_file)
            if 'References' in message.keys():
                references = message['References'].split(' ')
                result['references'] = references
        return result

    def get_json(self, current_file, arg_process_text_part, arg_process_html_part, arg_process_both_empty,
                 arg_kmeans_cluster_dictionary):
        result = {'original_file': current_file}
        with open(current_file, 'rb') as fp:
            message = pyzmail.message_from_file(fp)
            # todo clean up internal whitespace
            senders = message.get_addresses('from')
            result['sender'] = [item[i] for i in [0, 1] for item in senders]
            result['short_sender'] = [item.split('@')[0] for item in result['sender']]
            clean_senders = [self.clean_address(item[1]) for item in senders]
            result['clean_sender'] = clean_senders

            # todo clean up internal whitespace
            recipients = message.get_addresses('to') + message.get_addresses('cc') + message.get_addresses('bcc')
            result['recipient'] = recipients
            result['party'] = list(
                ['{name} = {address}'.format(name=item[0], address=item[1]) for item in senders + recipients])
            result['clean_recipient'] = [self.clean_address(item[1]) for item in recipients]
            result['recipient'] = [item[i] for i in [0, 1] for item in recipients]
            result['short_recipient'] = [item.split('@')[0] for item in result['clean_recipient']]

            subject = message.get('subject')
            result['subject'] = '' if subject is None else subject.decode('iso-8859-1').encode(self.target_encoding)

            raw_date = message.get('date')
            if raw_date is not None:
                try:
                    result['date'] = dateutil.parser.parse(raw_date)
                except ValueError as valueError:
                    # todo find a way to deal with these special cases?
                    # we occasionally get a string the parser won't parse e.g.
                    # Wed, 17 Dec 2008 12:35:42 -0700 (GMT-07:00)
                    # and we need to drop off the trailing time zone and try to parse again
                    logging.warn('%s %s %s', raw_date, valueError, current_file)
                    pieces = str(raw_date).split('(')
                    result['date'] = dateutil.parser.parse(pieces[0])
            else:
                # todo add special code to handle these?
                logging.warn('no date: %s ', message)

            text_part = message.text_part
            if text_part is not None and arg_process_text_part:
                charset = text_part.charset
                payload = text_part.get_payload()
                if charset is not None:
                    try:
                        body = payload.decode(charset, 'ignore').encode(self.target_encoding)
                    except LookupError as lookupError:
                        if text_part.charset == 'iso-8859-8-i':
                            body = payload.decode('iso-8859-8', 'ignore').encode(self.target_encoding)
                        else:
                            body = payload.decode('utf-8', 'ignore').encode(self.target_encoding)
                            logging.warn('lookup error %s', lookupError)
                else:
                    body = payload.decode('utf-8', 'ignore').encode(self.target_encoding)
                result['body'] = body

                short_file_name = os.path.basename(current_file)
                result['kmeans_cluster'] = arg_kmeans_cluster_dictionary[short_file_name]

            elif message.html_part is not None and arg_process_html_part:
                payload = message.html_part.part.get_payload()
                payload_text = bs4.BeautifulSoup(payload, 'lxml').get_text().strip()
                charset = message.html_part.charset if message.html_part.charset is not None else 'utf-8'
                result['body'] = payload_text.decode(charset, 'ignore').encode(self.target_encoding)
            elif arg_process_both_empty:
                logging.warn('both text_part and html_part are None: %s', current_file)
            else:
                logging.warn('not processing %s', current_file)

            if 'body' in result.keys():
                if len(result['body']) == 0:
                    result['empty_body'] = True

            if 'Message-Id' in message.keys():
                result['message-id'] = message['Message-Id']
            if 'In-Reply-To' in message.keys():
                result['in-reply-to'] = message['In-Reply-To']
            if 'References' in message.keys():
                result['references'] = message['References'].split(' ')

        md5 = hashlib.md5()
        with open(current_file, 'rb') as fp:
            md5.update(fp.read())

        return result, md5.hexdigest()
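
A minimal usage sketch for the Importer class above (hedged: the Elasticsearch host, folder path, index name and document type are placeholders, and arg_kmeans_dictionary must map each mail file's basename to a cluster id):

if __name__ == '__main__':
    es_server = elasticsearch.Elasticsearch(hosts=['localhost:9200'])
    importer = Importer(arg_document_count_limit=1000)
    importer.process_folder('./mail_archive', arg_bulk_upload=True, arg_document_type='email',
                            arg_buffer_limit=500, arg_server=es_server, arg_index_name='mail-index',
                            arg_kmeans_dictionary={})  # placeholder: supply the real filename-to-cluster mapping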
Esempio n. 41
0
def problem7b(debate):
    porterStemmer = PorterStemmer()
    snowballStemmer = SnowballStemmer("english", ignore_stopwords=False)
    lancasterStemmer = LancasterStemmer()
   # cachedStopWords = stopwords.words("english")
    tokenizer = RegexpTokenizer(r'\w+')
    stemDict = {'LEHRER': {},'OBAMA': {}, 'ROMNEY': {}}
    
    LEHRER = debate['LEHRER']
    OBAMA = debate['OBAMA']
    ROMNEY = debate['ROMNEY']
    
    LEHRER = "".join(LEHRER)
    LEHRER = tokenizer.tokenize(LEHRER)
    LEHRER = ' '.join([word.lower() for word in LEHRER if word not in stopwords.words("english")])
    
    pstemmed_words = ' '.join([porterStemmer.stem(word) for word in LEHRER.split(' ')])
    stemDict['LEHRER'].update({'porterStemmer':pstemmed_words})
    stemDict11 = stemDict['LEHRER']['porterStemmer']
    print("\n\n\nLEHRER:porterStemmer\n\n ",stemDict11)
    
    sstemmed_words = ' '.join([snowballStemmer.stem(word) for word in LEHRER.split(' ')])
    stemDict['LEHRER'].update({'snowballStemmer':sstemmed_words})
    stemDict12 = stemDict['LEHRER']['snowballStemmer']
    print("\n\n\nLEHRER:snowballStemmer \n\n ",stemDict12)
    
    lstemmed_words = ' '.join([lancasterStemmer.stem(word) for word in LEHRER.split(' ')])
    stemDict['LEHRER'].update({'lancasterStemmer':lstemmed_words})
    stemDict13 = stemDict['LEHRER']['lancasterStemmer']
    print("\n\n\nLEHRER:lancasterStemmer \n\n ",stemDict13)
    
    
    
    OBAMA = "".join(OBAMA)
    OBAMA = tokenizer.tokenize(OBAMA)
    OBAMA = ' '.join([word.lower() for word in OBAMA if word not in stopwords.words("english")])
    
    pstemmed_words = ' '.join([porterStemmer.stem(word) for word in OBAMA.split(' ')])
    stemDict['OBAMA'].update({'porterStemmer':pstemmed_words})
    stemDict21 = stemDict['OBAMA']['porterStemmer']
    print("\n\n\nOBAMA:porterStemmer\n\n ",stemDict21)
	
    sstemmed_words = ' '.join([snowballStemmer.stem(word) for word in OBAMA.split(' ')])
    stemDict['OBAMA'].update({'snowballStemmer':sstemmed_words})
    stemDict22 = stemDict['OBAMA']['snowballStemmer']
    print("\n\n\nOBAMA:snowballStemmer \n\n ",stemDict22)
	
    lstemmed_words = ' '.join([lancasterStemmer.stem(word) for word in OBAMA.split(' ')])
    stemDict['OBAMA'].update({'lancasterStemmer':lstemmed_words})
    stemDict23 = stemDict['OBAMA']['lancasterStemmer']
    print("\n\n\nOBAMA:lancasterStemmer \n\n ",stemDict23)
    
    
    ROMNEY = "".join(ROMNEY)
    ROMNEY = tokenizer.tokenize(ROMNEY)
    ROMNEY = ' '.join([word.lower() for word in ROMNEY if word not in stopwords.words("english")])
    
    pstemmed_words = ' '.join([porterStemmer.stem(word) for word in ROMNEY.split(' ')])
    stemDict['ROMNEY'].update({'porterStemmer':pstemmed_words})
    stemDict31 = stemDict['ROMNEY']['porterStemmer']
    print("\n\n\nROMNEY:porterStemmer\n\n ",stemDict31)
	
    sstemmed_words = ' '.join([snowballStemmer.stem(word) for word in ROMNEY.split(' ')])
    stemDict['ROMNEY'].update({'snowballStemmer':sstemmed_words})
    stemDict32 = stemDict['ROMNEY']['snowballStemmer']
    print("\n\n\nROMNEY:snowballStemmer \n\n ",stemDict32)
	
    lstemmed_words = ' '.join([lancasterStemmer.stem(word) for word in ROMNEY.split(' ')])
    stemDict['ROMNEY'].update({'lancasterStemmer':lstemmed_words})
    stemDict33 = stemDict['ROMNEY']['lancasterStemmer']
    print("\n\n\nROMNEY:lancasterStemmer \n\n ",stemDict33)
    
    return stemDict
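
A quick usage sketch for problem7b (hedged: the debate dict below is a toy stand-in, each value being a list of that speaker's utterances, which is the structure the function expects):

sample_debate = {'LEHRER': ['Good evening from the university.'],
                 'OBAMA': ['Thank you very much.'],
                 'ROMNEY': ['Thank you, Jim.']}
stems_by_speaker = problem7b(sample_debate)
print(stems_by_speaker['OBAMA']['snowballStemmer'])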
Esempio n. 42
0
from nltk.corpus import stopwords
sw = stopwords.words("english")
print 'Amount of English stopwords: ', len(sw)
sw = stopwords.words("russian")
print 'Amount of Russian stopwords: ', len(sw)
for word in sw:
    print word
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
stemmer.stem("responsiveness")
from nltk.tokenize import word_tokenize

def ball_stemming(text):
    stemmer = SnowballStemmer("english")
    tokens = word_tokenize(text)
    stems = [stemmer.stem(token) for token in tokens]
    return stems
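
For example (a sketch; the sample sentence is mine, and the output shown is what the English Snowball stemmer typically produces):

print ball_stemming("The stemmers were stemming the stems")
# e.g. ['the', 'stemmer', 'were', 'stem', 'the', 'stem']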
Esempio n. 44
0
email_list = [string1,string2]

vectorizer.fit(email_list)


bag_of_words = vectorizer.transform(email_list)

print (bag_of_words)

print (vectorizer.vocabulary_.get('great'))

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

stemmer.stem("responsiveness")


stem_rus = SnowballStemmer("russian")

stem_rus.stem("Плотный")

in_data = "Плотный Продам свитер из Англии, фирма Woolovers, 100% хлопок. Не подошел размер. Свитер идет на 56-58 примерно размер. Плотный, не тонкий. Отдаю за свою цену, перезакажу меньший размер."

list = in_data.split(" ")

stem_rus.stem(list[2])


from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import FrenchStemmer
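
The FrenchStemmer import above is left dangling in this excerpt; a minimal sketch of how it could be used (the sample word is mine):

stem_fr = FrenchStemmer()
print (stem_fr.stem("continuellement"))  # typically stems to 'continuel'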
Esempio n. 45
0
def cleaning(doc):
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['sentence'])  # each 'sentence' row is actually a paragraph

t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

txt_stem = []

for row in df['sentence']:
    cleaned_sent = re.sub("[^A-Za-z']+", ' ', str(row)).lower()
    # stem word by word: split the cleaned string rather than iterating over its characters
    txt_stem.append([Snow.stem(word) for word in cleaned_sent.split()])



print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

df_clean = pd.DataFrame({'clean': txt + txt_stem})  # list.extend returns None, so concatenate instead
df_clean = df_clean.dropna().drop_duplicates()

df_clean.to_sql('cleaned_TDS', conn, if_exists='replace', index=False)

conn.commit()
conn.close()
Esempio n. 46
0
## extract sentences from paragraph
fullSents = nltk.sent_tokenize(content)

## extract words from sentences
## find parts of speech of words
wordsInSents = list()
posInSents = list()
for sent in fullSents:
    sent_token = nltk.word_tokenize(sent)  # word tokenize the sentence
    sent_pos = nltk.pos_tag(sent_token)  # tag with part of speech
    stemWords = list()  # create list to store stemmed words
    onlyPOS = list()  # create list to store sentence parts of speech
    for item in sent_pos:
        word = item[0]
        stemWords.append(p_stemmer.stem(
            word.lower()))  # lower case and stem words
        onlyPOS.append(item[1])
    wordsInSents.append(stemWords)
    posInSents.append(onlyPOS)

## go through each word and the pos of each word
## to see if it matches the 600 feature names of the
## finalized model to create the 600 element binary
## vector
featureNames = np.load("../Data/FeatureName_finalized_model.npy")
featureMat = np.zeros((len(wordsInSents), len(featureNames)))
for i in range(0, len(wordsInSents)):
    words = wordsInSents[i]
    POSes = posInSents[i]
    for j in range(0, len(featureNames)):
        name = featureNames[j]
        # flag feature j when the stemmed words or POS tags of this sentence contain the feature name
        if name in words or name in POSes:
            featureMat[i, j] = 1
Esempio n. 47
0
    # tail of the clean_number helper used below
    rtext = re.sub(r'[Yy]ear', '', rtext)
    return rtext


#cleaning the text for each case
cleaned_text = [clean(each_case) for each_case in text]
cleaned_text = [clean_number(each_case) for each_case in cleaned_text]

#creating a simple tfidf with just unigram
#remove stopwords
stopwords = nltk.corpus.stopwords.words('english')

#tokenize each word and stem
tokenized_text = [nltk.word_tokenize(each_case) for each_case in cleaned_text]
tokenized_text = [[
    stemmer.stem(word) for word in each_case if word not in stopwords
] for each_case in tokenized_text]

tot_text = list(chain.from_iterable(tokenized_text))
fdist = FreqDist(tot_text)
wordList = fdist.values()
wordArray = np.array(wordList)
print '50th percentile word count:', np.percentile(wordArray, 50)
print fdist.most_common(30)
#plotting fdist on a cumulative chart
fdist.plot(30, cumulative=True)
#plotting fdist on a non cumulative chart
fdist.plot(30)
print 'seldom appearing words:', fdist.hapaxes()

tfidf_text = []
Esempio n. 48
0
predictedTest, tweetsTest, topicTest = obtainPredictedAndTweets('TweetsDownloaded/testData/test.txt')


# http://www.nltk.org/api/nltk.tokenize.html
tknzr = TweetTokenizer()
stemmer = SnowballStemmer("english")
vectorizer = TfidfVectorizer(analyzer = "word",
                           tokenizer = None,
                           preprocessor = None,
                           stop_words = 'english'
                            )


tw = []
for t,statement in enumerate(tweets):
    tw.append(' '.join(stemmer.stem(i) for i in tknzr.tokenize(statement)))

twt = []
for t,statement in enumerate(tweetsTest):
    twt.append(' '.join(stemmer.stem(i) for i in tknzr.tokenize(statement)))

train = tw
sentimentTrain = predicted
test = twt
sentimentTest = predictedTest


train, test, sentimentTrain, sentimentTest = cross_validation.train_test_split(tw, predicted, test_size=0.3, random_state=15)

train_data_features= vectorizer.fit_transform(train)
train_data_features = train_data_features.toarray()
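
One possible continuation, not part of the original excerpt: fit a simple classifier on the TF-IDF features and score it on the held-out split (the classifier choice is mine).

from sklearn.linear_model import LogisticRegression

test_data_features = vectorizer.transform(test).toarray()
clf = LogisticRegression()
clf.fit(train_data_features, sentimentTrain)
print(clf.score(test_data_features, sentimentTest))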
Esempio n. 49
0
print "We read the file contents. Size %d bytes" % (len(fileContents))

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
words = tokenizer.tokenize(fileContents)
# print words

print "No. of words before prep : %d " % (len(words))

good_words = []
#remove stopwords and numbers (build the stopword set once instead of once per word)
stop_set = set(stopwords.words())
for w in words:
    if w.lower() not in stop_set and not w.isdigit():
        good_words.append(w.lower())

print "No. of words after removing stopwords and digits: %d" % (len(good_words))

stemmer = SnowballStemmer("english")

stemmed_words = [stemmer.stem(w) for w in good_words]

print "No. of stemmed words : %d" % (len(stemmed_words))

fdist = nltk.FreqDist(stemmed_words)

print "Top 50 Words"
print fdist.most_common(50)

#print words
Esempio n. 50
0
def keyurFeatures(rawSentence):
    # TODO get a proper lexicon of intensifiers
    intensifiers = set([])
    with open('dataset/lexicon/intensifiers.txt', 'r') as intfile:
        for line in intfile:
            for word in line.split():
                intensifiers.add(word)
    rawSentence = nltk.word_tokenize(rawSentence)
    sentence = []
    for token in rawSentence:
        match = re.search(r'^[.,?!-()";:\']+$', token)
        if match is None:
            sentence.append(token)
    sentence = nltk.pos_tag(sentence)
    stemmer = SnowballStemmer("english")
    features = []
    lexicon = loadLexicon('dataset/lexicon/subjclueslen1-HLTEMNLP05.tff')
    for i in range(len(sentence)):
        feature = []
        # the word token and part of speech
        feature.append(stemmer.stem(sentence[i][0]))
        feature.append(sentence[i][1])
        # word context: before, this, after
        if i > 0:
            feature.append(sentence[i - 1][0])
        else:
            feature.append('')
        feature.append(sentence[i][0])
        if i < len(sentence) - 1:
            feature.append(sentence[i + 1][0])
        else:
            feature.append('')
        try:
            feature.append(lexicon[sentence[i][0]]['priorpolarity'])
        except KeyError:
            feature.append('none')
        try:
            feature.append(lexicon[sentence[i][0]]['type'])
        except KeyError:
            feature.append('none')
        if i > 0:
            prevTag = sentence[i - 1][1]
            # preceded by adjective
            if prevTag[0] == 'J' and prevTag[1] == 'J':
                feature.append(True)
            else:
                feature.append(False)
            # preceded by adverb other than not
            if prevTag[0] == 'R' and prevTag[1] == 'B' and sentence[
                    i - 1][0].lower() != 'not':
                feature.append(True)
            else:
                feature.append(False)
            # preceded by intensifier
            if sentence[i - 1][0].lower() in intensifiers:
                feature.append(True)
            else:
                feature.append(False)
        else:
            feature.append(False)
            feature.append(False)
            feature.append(False)
        # is intensifier
        if sentence[i][0].lower() in intensifiers:
            feature.append(True)
        else:
            feature.append(False)
        features.append(feature)
    return features
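
A small usage sketch (hedged: it assumes the lexicon files referenced above exist at those paths, and the sample sentence is mine):

sample_features = keyurFeatures("This movie was absolutely wonderful !")
for f in sample_features:
    print(f)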
Esempio n. 51
0
####Remove stop words
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

tokenized_words = [word for word in tokenized_words if word not in stop_words]
tokenized_words

###reduced to 27

###Stemming
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english", ignore_stopwords=True)
tokenized_words = [stemmer.stem(word) for word in tokenized_words]

tokenized_words

final_synopsis = []
for r in range(len(movie_synopsis_nomissing)):
    input_document = movie_synopsis_nomissing[r].lower()
    input_document = re.sub(r'\d+', '', input_document)
    input_document = strip_punctuation(input_document)
    input_words = word_tokenize(input_document)
    input_words = [word for word in input_words if word not in stop_words]
    input_words = [stemmer.stem(word) for word in input_words]
    clean_synopsis = ' '.join(map(str, input_words))
    final_synopsis.append(clean_synopsis)

from sklearn.feature_extraction.text import CountVectorizer
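
The import above suggests the cleaned synopses are vectorized next; a minimal sketch of that step (the max_features value is an arbitrary choice of mine):

count_vectorizer = CountVectorizer(max_features=5000)
synopsis_counts = count_vectorizer.fit_transform(final_synopsis)
print(synopsis_counts.shape)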
Esempio n. 52
0
# In[7]:

print("Cleaning dataset...")
t0 = time()
stopwords = load_stopwords(stopword_file)
stemmer = SnowballStemmer('english', ignore_stopwords=True)
stemmer.stopwords = stopwords

clean_data_samples = []
for sample in data_samples:
    clean_sample = ''
    for token in re.split("'(?!(d|m|t|ll|ve)\W)|[.,\-_!?:;()0-9@=+^*`~#$%&| \t\n\>\<\"\\\/\[\]{}]+", sample.lower().decode('utf-8')):
        if not token or token in stopwords:
            continue
        if stemming:
            token = stemmer.stem(token)
        if len(token) < minlength:
            continue
        clean_sample = clean_sample + ' ' + token
    clean_data_samples.append(clean_sample)
print("done in %0.3fs." % (time() - t0))


# # TF-IDF / TF Vectors

# In[8]:

# tf (raw term count)
print("Extracting tf features...")
tf_vectorizer = CountVectorizer(max_df=0.95, stop_words='english',
                               max_features=n_features)
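
A sketch of the step this excerpt stops short of (fitting the count vectorizer on the cleaned samples; not shown in the original):

tf = tf_vectorizer.fit_transform(clean_data_samples)
print("tf matrix shape: %s" % str(tf.shape))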
Esempio n. 53
0
plt.axis('off')
plt.show()

stemmer = SnowballStemmer("english", ignore_stopwords=True)
text_1 = df_2["tweet"].to_csv()
tokenizer = RegexpTokenizer(r"\w+")
word_tokens = tokenizer.tokenize(text_1)
filtered_sentence = []

lemmatizer = WordNetLemmatizer()
for w in word_tokens:
    if w not in stopwords:
        # lemmatize, then stem each kept token
        w = lemmatizer.lemmatize(w)
        w = stemmer.stem(w)
        filtered_sentence.append(w)

split_it = filtered_sentence  # the tokens are already split; stringifying the list would leave quotes and brackets on them

from collections import Counter
word_counts = Counter(split_it)  # avoid rebinding the Counter class name
most_occur = word_counts.most_common(60)

frequentwords = pd.DataFrame(most_occur).rename(columns={0:"words", 1:"Frequencies"}).\
                sort_values(by="Frequencies").reset_index(drop=True)

frequentwords = frequentwords.drop(index=[6, 7, 15, 26, 41, 47, 56, 57]).reset_index(drop=True)
print(frequentwords)

fig = px.bar(frequentwords, x="Frequencies", y="words", orientation="h", height=1000,
Esempio n. 54
0
def cli_main():
    #parser = argparse.ArgumentParser(description=metrics_description, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser = argparse.ArgumentParser(description="predictor")
    parser.add_argument('--config-file',
                        type=str,
                        help='config file with metric parameters')
    parser.add_argument('--metrics',
                        type=str,
                        help='comma-separated string of metrics')
    parser.add_argument('--aggregate',
                        type=bool,
                        help='whether to aggregate scores')
    parser.add_argument('--jsonl-file',
                        type=str,
                        help='input jsonl file to score')
    parser.add_argument('--article-file', type=str, help='input article file')
    parser.add_argument('--summ-file', type=str, help='input summary file')
    parser.add_argument('--ref-file', type=str, help='input reference file')
    parser.add_argument('--output-file', type=str, help='output file')
    parser.add_argument(
        '--eos',
        type=str,
        help='EOS for ROUGE (if reference not supplied as list)')
    args = parser.parse_args()

    # =====================================
    # INITIALIZE METRICS
    gin.parse_config_file(args.config_file)
    toks_needed = set()
    metrics = [x.strip() for x in args.metrics.split(",")]
    metrics_dict = {}
    if "rouge" in metrics:
        from summ_eval.rouge_metric import RougeMetric
        metrics_dict["rouge"] = RougeMetric()
        toks_needed.add("line_delimited")

    if "bert_score" in metrics:
        from summ_eval.bert_score_metric import BertScoreMetric
        bert_score_metric = BertScoreMetric()
        metrics_dict["bert_score"] = bert_score_metric
        toks_needed.add("space")
    if "mover_score" in metrics:
        from summ_eval.mover_score_metric import MoverScoreMetric
        mover_score_metric = MoverScoreMetric()
        metrics_dict["mover_score"] = mover_score_metric
        toks_needed.add("space")
    if "chrf" in metrics:
        from summ_eval.chrfpp_metric import ChrfppMetric
        metrics_dict["chrf"] = ChrfppMetric()
        toks_needed.add("space")
    if "meteor" in metrics:
        from summ_eval.meteor_metric import MeteorMetric
        metrics_dict["meteor"] = MeteorMetric()
        toks_needed.add("space")
    if "bleu" in metrics:
        from summ_eval.bleu_metric import BleuMetric
        metrics_dict["bleu"] = BleuMetric()
        toks_needed.add("space")
    if "cider" in metrics:
        from summ_eval.cider_metric import CiderMetric
        metrics_dict["cider"] = CiderMetric()
        toks_needed.add("stem")

    if "s3" in metrics:
        from summ_eval.s3_metric import S3Metric
        metrics_dict["s3"] = S3Metric()
        toks_needed.add("stem")
    if "rouge_we" in metrics:
        from summ_eval.rouge_we_metric import RougeWeMetric
        metrics_dict["rouge_we"] = RougeWeMetric()
        toks_needed.add("stem")

    if "stats" in metrics:
        from summ_eval.data_stats_metric import DataStatsMetric
        metrics_dict['stats'] = DataStatsMetric()
        toks_needed.add("spacy")
    if "sms" in metrics:
        from summ_eval.sentence_movers_metric import SentenceMoversMetric
        metrics_dict['sms'] = SentenceMoversMetric()
        toks_needed.add("spacy")
    if "summaqa" in metrics:
        from summ_eval.summa_qa_metric import SummaQAMetric
        metrics_dict['summaqa'] = SummaQAMetric()
        toks_needed.add("spacy")
        toks_needed.add("space")
    if "syntactic" in metrics:
        from summ_eval.syntactic_metric import SyntacticMetric
        metrics_dict["syntactic"] = SyntacticMetric()
        toks_needed.add("space")
    if "supert" in metrics:
        from summ_eval.supert_metric import SupertMetric
        metrics_dict['supert'] = SupertMetric()
        toks_needed.add("space")
    if "blanc" in metrics:
        from summ_eval.blanc_metric import BlancMetric
        metrics_dict['blanc'] = BlancMetric()
        toks_needed.add("space")
    # =====================================

    # =====================================
    # READ INPUT
    print("Reading the input")
    ids = []
    articles = []
    references = []
    summaries = []
    bad_lines = 0
    if args.jsonl_file is not None:
        try:
            with open(args.jsonl_file) as inputf:
                for count, line in enumerate(inputf):
                    try:
                        data = json.loads(line)
                        try:
                            ids.append(data['id'])
                        except:
                            pass
                        if len(data['decoded']) == 0:
                            bad_lines += 1
                            continue
                        summaries.append(data['decoded'])
                        references.append(data['reference'])
                        if "summaqa" in metrics or "stats" in metrics or "supert" in metrics or "blanc" in metrics:
                            try:
                                articles.append(data['text'])
                            except:
                                raise ValueError("You specified summaqa and stats, which" \
                                    "require input articles, but we could not parse the file!")
                    except:
                        bad_lines += 1
        except Exception as e:
            print("Input did not match required format")
            print(e)
            sys.exit()
        print(f"This many bad lines encountered during loading: {bad_lines}")

    if args.summ_file is not None:
        with open(args.summ_file) as inputf:
            summaries = inputf.read().splitlines()
    if args.ref_file is not None:
        with open(args.ref_file) as inputf:
            references = inputf.read().splitlines()
    if "summaqa" in metrics or "stats" in metrics or "supert" in metrics or "blanc" in metrics:
        if args.article_file is None and len(articles) == 0:
            raise ValueError("You specified summaqa and stats, which" \
                 "require input articles, but we could not parse the file!")
        if len(articles) == 0:
            with open(args.article_file) as inputf:
                articles = inputf.read().splitlines()
    if len(ids) == 0:
        ids = list(range(0, len(summaries)))
    # =====================================

    # =====================================
    # TOKENIZATION
    print("Preparing the input")
    references_delimited = None
    summaries_delimited = None
    if len(references) > 0:
        if isinstance(references[0], list):
            if "line_delimited" in toks_needed:
                references_delimited = ["\n".join(ref) for ref in references]
            if "space" in toks_needed:
                references_space = [" ".join(ref) for ref in references]
        elif args.eos is not None:
            if "line_delimited" not in toks_needed:
                raise ValueError(
                    'You provided a delimiter but are not using a metric which requires one.'
                )
            if args.eos == "\n":
                references_delimited = [
                    ref.split(args.eos) for ref in references
                ]
            else:
                references_delimited = [
                    f"{args.eos}\n".join(ref.split(args.eos))
                    for ref in references
                ]
        elif "line_delimited" in toks_needed:
            references_delimited = references
        if "space" in toks_needed:
            references_space = references

    if isinstance(summaries[0], list):
        if "line_delimited" in toks_needed:
            summaries_delimited = ["\n".join(summ) for summ in summaries]
        if "space" in toks_needed:
            summaries_space = [" ".join(summ) for summ in summaries]
    elif args.eos is not None:
        if "line_delimited" not in toks_needed:
            raise ValueError(
                'You provided a delimiter but are not using a metric which requires one.'
            )
        if args.eos == "\n":
            summaries_delimited = [ref.split(args.eos) for ref in summaries]
        else:
            summaries_delimited = [
                f"{args.eos}\n".join(ref.split(args.eos)) for ref in summaries
            ]
    elif "line_delimited" in toks_needed:
        summaries_delimited = summaries
    if "space" in toks_needed:
        summaries_space = summaries

    if "stem" in toks_needed:
        tokenizer = RegexpTokenizer(r'\w+')
        stemmer = SnowballStemmer("english")
        if isinstance(summaries[0], list):
            summaries_stemmed = [[
                stemmer.stem(word)
                for word in tokenizer.tokenize(" ".join(summ))
            ] for summ in summaries]
            references_stemmed = [[
                stemmer.stem(word)
                for word in tokenizer.tokenize(" ".join(ref))
            ] for ref in references]
        else:
            summaries_stemmed = [[
                stemmer.stem(word) for word in tokenizer.tokenize(summ)
            ] for summ in summaries]
            references_stemmed = [[
                stemmer.stem(word) for word in tokenizer.tokenize(ref)
            ] for ref in references]
        summaries_stemmed = [" ".join(summ) for summ in summaries_stemmed]
        references_stemmed = [" ".join(ref) for ref in references_stemmed]

    if "spacy" in toks_needed:
        try:
            nlp = spacy.load('en_core_web_md')
        except OSError:
            print(
                'Downloading the spacy en_core_web_md model\n'
                "(don't worry, this will only happen once)",
                file=stderr)
            from spacy.cli import download
            download('en_core_web_md')
            nlp = spacy.load('en_core_web_md')
        disable = ["tagger", "textcat"]
        if "summaqa" not in metrics:
            disable.append("ner")
        if isinstance(summaries[0], list):
            summaries_spacy = [
                nlp(" ".join(text), disable=disable) for text in summaries
            ]
        else:
            summaries_spacy = [
                nlp(text, disable=disable) for text in summaries
            ]
        if "stats" in metrics:
            summaries_spacy_stats = [[tok.text for tok in summary]
                                     for summary in summaries_spacy]
        if "sms" in metrics:
            if isinstance(references[0], list):
                references_spacy = [
                    nlp(" ".join(text), disable=disable) for text in references
                ]
            else:
                references_spacy = [
                    nlp(text, disable=disable) for text in references
                ]
        if "summaqa" in metrics or "stats" in metrics:
            if isinstance(articles[0], list):
                input_spacy = [
                    nlp(" ".join(text), disable=disable) for text in articles
                ]
            else:
                input_spacy = [nlp(text, disable=disable) for text in articles]
            if "stats" in metrics:
                input_spacy_stats = [[tok.text for tok in article]
                                     for article in input_spacy]
    if "supert" in metrics or "blanc" in metrics:
        inputs_space = articles
    # =====================================

    # =====================================
    # GET SCORES
    if args.aggregate:
        final_output = dict()
    else:
        final_output = defaultdict(lambda: defaultdict(int))
    #import pdb;pdb.set_trace()
    for metric, metric_cls in metrics_dict.items():
        print(f"Calculating scores for the {metric} metric.")
        try:
            if metric == "rouge":
                output = metric_cls.evaluate_batch(summaries_delimited,
                                                   references_delimited,
                                                   aggregate=args.aggregate)
                # only rouge uses this input so we can delete it
                del references_delimited
                del summaries_delimited
            elif metric in ('bert_score', 'mover_score', 'chrf', 'meteor',
                            'bleu'):
                output = metric_cls.evaluate_batch(summaries_space,
                                                   references_space,
                                                   aggregate=args.aggregate)
            elif metric in ('s3', 'rouge_we', 'cider'):
                output = metric_cls.evaluate_batch(summaries_stemmed,
                                                   references_stemmed,
                                                   aggregate=args.aggregate)
            elif metric == "sms":
                output = metric_cls.evaluate_batch(summaries_spacy,
                                                   references_spacy,
                                                   aggregate=args.aggregate)
            elif metric in ('summaqa', 'stats', 'supert', 'blanc'):
                if metric == "summaqa":
                    output = metric_cls.evaluate_batch(
                        summaries_space, input_spacy, aggregate=args.aggregate)
                elif metric == "stats":
                    output = metric_cls.evaluate_batch(
                        summaries_spacy_stats,
                        input_spacy_stats,
                        aggregate=args.aggregate)
                elif metric in ('supert', 'blanc'):
                    output = metric_cls.evaluate_batch(
                        summaries_space,
                        inputs_space,
                        aggregate=args.aggregate)
            if args.aggregate:
                final_output.update(output)
            else:
                ids = list(range(0, len(ids)))
                for cur_id, cur_output in zip(ids, output):
                    final_output[cur_id].update(cur_output)
        except Exception as e:
            print(e)
            print(f"An error was encountered with the {metric} metric.")
    # =====================================

    # =====================================
    # OUTPUT SCORES
    metrics_str = "_".join(metrics)
    #json_file_end = args.jsonl_file.split("/")[-1]
    json_file_end = args.jsonl_file.replace("/", "_")
    with open(
            f"outputs/{args.output_file}_{json_file_end}_{metrics_str}.jsonl",
            "w") as outputf:
        if args.aggregate:
            json.dump(final_output, outputf)
        else:
            for key, value in final_output.items():
                value["id"] = key
                json.dump(value, outputf)
                outputf.write("\n")
Esempio n. 55
0
def load_references(input_file,
                    sep_doc_id=':',
                    sep_ref_keyphrases=',',
                    normalize_reference=False,
                    language="en",
                    encoding='utf-8'):
    """Load a reference file. Reference file can be either in json format or in
    the SemEval-2010 official format.

    Args:
        input_file (str): path to the reference file.
        sep_doc_id (str): the separator used for doc_id in reference file,
            defaults to ':'.
        sep_ref_keyphrases (str): the separator used for keyphrases in
            reference file, defaults to ','.
        normalize_reference (bool): whether to normalize the reference
            keyphrases using stemming, default to False.
        language (str): language of the input documents (used for computing the
            stems), defaults to 'en' (english).
        encoding (str): file encoding, defaults to utf-8.
    """

    logging.info('loading reference keyphrases from {}'.format(input_file))

    references = defaultdict(list)

    # open input file
    with codecs.open(input_file, 'r', encoding) as f:

        # load json data
        if input_file.endswith('.json'):
            references = json.load(f)
            for doc_id in references:
                references[doc_id] = [keyphrase for variants in
                                      references[doc_id] for keyphrase in
                                      variants]
        # or load SemEval-2010 file
        else:
            for line in f:
                cols = line.strip().split(sep_doc_id)
                doc_id = cols[0].strip()
                keyphrases = cols[1].strip().split(sep_ref_keyphrases)
                for v in keyphrases:
                    if '+' in v:
                        for s in v.split('+'):
                            references[doc_id].append(s)
                    else:
                        references[doc_id].append(v)

        # normalize reference if needed
        if normalize_reference:

            # initialize stemmer
            stemmer = SnowballStemmer("porter")
            if language != 'en':
                stemmer = SnowballStemmer(ISO_to_language[language],
                                          ignore_stopwords=True)

            for doc_id in references:
                for i, keyphrase in enumerate(references[doc_id]):
                    stems = [stemmer.stem(w) for w in keyphrase.split()]
                    references[doc_id][i] = ' '.join(stems)

    return references
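
A brief usage sketch (the file name is illustrative; the function accepts either the json format or the SemEval-2010 format described in the docstring):

refs = load_references('references.json', normalize_reference=True, language='en')
for doc_id, keyphrases in refs.items():
    print(doc_id, keyphrases[:3])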
Esempio n. 56
0
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from string import punctuation

# stopwords
nltk.download('stopwords')
stop_words = list(set(stopwords.words('english')))
punc = list(set(punctuation))
stop_words.extend(punc)
stop_words.extend(["'s", "'d", "'m"])
print(stop_words)

stemmer = SnowballStemmer('english')
for x in combined:
    x = word_tokenize(x)
    x = [(stemmer.stem(i)).lower() for i in x]
    # filter each token (the original compared the whole token list, x, against the stopwords)
    x = [i for i in x if i not in stop_words]
    combined_features.append(x)

# mapping frequencies with words
from gensim import corpora

dictionary = corpora.Dictionary(combined_features)
print(dictionary)

ids = []  # avoid shadowing the built-in id
for x in combined_features:
    temp = [dictionary.token2id[j] for j in x]
    ids.append(temp)
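
gensim can produce the same information directly as (token_id, count) pairs; a hedged alternative to the manual loop above:

corpus = [dictionary.doc2bow(doc) for doc in combined_features]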

# Creating MLP
Esempio n. 57
0
            "not",
            "h/o",
            "never",
            "none",
            "nor",
            "non",
            "rare",
            "previous",
            "prior",
            "history",
            "denies",
            "negative"]

# Run the stemmer on feature words
snowball_stemmer = SnowballStemmer("english")
stemmed_alcohol = [snowball_stemmer.stem(word) for word in ALCOHOL]
stemmed_alcohol_modifer = [snowball_stemmer.stem(word) for word in ALCOHOL_MODIFIER]
stemmed_alcohol_mental = [snowball_stemmer.stem(word) for word in ALCOHOL_MENTAL]
stemmed_negation = [snowball_stemmer.stem(word) for word in NEGATION]

# MAIN SCRIPT

# Distance to look for feature in certain direction
left_negation = 5
right_negation = 3
left_modifier = 2
right_modifier = 2

# Read file
def alcohol_abuse_classifier(file_name):
    tree=ET.parse(file_name)
Esempio n. 58
0
def get_stem(content):
    stemmer = SnowballStemmer('english')
    for k in range(len(content)):
        content[k] = stemmer.stem(content[k]).encode('utf-8')
Esempio n. 59
0
def tokenize_stem(train_texts):
	tokens = tokenize(train_texts)
	stemmer = SnowballStemmer('english')
	stemmed_tokens = [stemmer.stem(token) for token in tokens]
	return stemmed_tokens
Esempio n. 60
0
def features(tokens, index, history):
    """
    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """

    # init the stemmer
    stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'),
              ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'),
                                                          ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)

    # shift the index with 2, to accommodate the padding
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all(c in string.ascii_lowercase for c in word)  # the original list comprehension was always truthy

    allcaps = word == word.capitalize()
    capitalized = word[0] in string.ascii_uppercase

    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase

    nextallcaps = nextword == nextword.capitalize()
    nextcapitalized = nextword[0] in string.ascii_uppercase

    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,
        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,
        'next-next-word': nextnextword,
        'nextnextpos': nextnextpos,
        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,
        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
        'prev-iob': previob,
        'contains-dash': contains_dash,
        'contains-dot': contains_dot,
        'all-caps': allcaps,
        'capitalized': capitalized,
        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,
        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }
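
A small usage sketch for features() (the sentence is mine; assumes nltk and its tokenizer/tagger data are installed):

import nltk

tagged = nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumps over the lazy dog"))
print(features(tagged, index=3, history=['O', 'O', 'O']))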