def get_words_list(dataset):
    '''
    Load the given dataset, read the file contents, tokenize them into
    words and lemmatize the resulting tokens.
    '''

    # join the path and file name together
    spam_path = 'data/enron/pre/' + dataset + '/spam/'
    ham_path = 'data/enron/pre/' + dataset + '/ham/'
    spam_npl = [i[-1] for i in os.walk(spam_path)][0]
    ham_npl = [i[-1] for i in os.walk(ham_path)][0]

    spam_fl = (open(os.path.join(spam_path, j)).read().lower() for j in spam_npl)
    ham_fl = (open(os.path.join(ham_path, j)).read().lower() for j in ham_npl)

    splitter = re.compile(r"\W+")
    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # tokenize the files into words, drop stopwords and non-alphabetic tokens,
    # then keep only lemmas between 3 and 19 characters long
    spam_wl = [None] * len(spam_npl)
    for i, f in enumerate(spam_fl):
        spam_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                      if w not in english_stops and w.isalpha())
                      if 2 < len(word) < 20]

    ham_wl = [None] * len(ham_npl)
    for i, f in enumerate(ham_fl):
        ham_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f)
                     if w not in english_stops and w.isalpha())
                     if 2 < len(word) < 20]

    return spam_wl, ham_wl
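A minimal usage sketch for the function above, assuming the data/enron/pre/<dataset>/spam|ham layout implied by the paths, that os, re, stopwords and WordNetLemmatizer are imported at module level, and that the NLTK stopwords/wordnet data are downloaded ('enron1' is a hypothetical folder name):

spam_words, ham_words = get_words_list('enron1')  # hypothetical dataset folder
print(len(spam_words), 'spam mails,', len(ham_words), 'ham mails')
print(spam_words[0][:10])  # first ten lemmas of the first spam mail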
Example #2
def tokenize3(text):
	wordnet_lemmatizer = WordNetLemmatizer()
	tokens             = word_tokenize(text)
	tokens             = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
	return tokens
def get_clean_text(list_filenames, path_to_file):
    '''
    parameters:
    -----------
    list_filenames: as LST is a list of filenames as STR
    path_to_file: as STR is the path to the folder containing the movie scripts
    --> such that path_to_file/filename.txt is the file to open

    returns:
    --------
    list of lists of words (lemmatized, lowercased) in each text (order preserved)
    '''
    wnl = WordNetLemmatizer()
    list_texts_as_words = []
    for filename in list_filenames:
        path_file = path_to_file+"/"+filename+".txt"
        with open(path_file) as f:
            text = f.readlines()
            lines = [line.strip() for line in text if line.strip()]
            string_words = []
            for line in lines:
                words = [wnl.lemmatize(word.lower()) for word in line.split(' ') if wnl.lemmatize(word.lower())]
                string_words += words
        list_texts_as_words.append(string_words)
    return list_texts_as_words
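A short usage sketch for get_clean_text; the folder and file names below are hypothetical, and each <filename>.txt must exist under the given path:

scripts = get_clean_text(['pulp_fiction', 'alien'], 'data/movie_scripts')  # hypothetical names
print(scripts[0][:20])  # first twenty lemmatized, lowercased words of the first script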
def bow_score(hypothesis_list,text_list):
	wordnet_lemmatizer = WordNetLemmatizer()
	stop_word_list = ['a', 'an', 'the', ',', '.', ';', ':' ]
	i = 0
	while i < len(hypothesis_list):
		if hypothesis_list[i] in stop_word_list:
			del hypothesis_list[i]
			i = i - 1
		i = i  + 1
	if len(hypothesis_list) == 0:
		return 0
	i = 0	
	while i < len(text_list):
		if text_list[i] in stop_word_list:
			del text_list[i]
			i = i - 1
		i = i + 1
	if len(text_list) == 0:
		return 0
	## Stop words removed up until here

	score = 0	
	for word_text in text_list:
		lemma_text = wordnet_lemmatizer.lemmatize(word_text)
		for word_hypothesis in hypothesis_list:
			lemma_hypothesis = wordnet_lemmatizer.lemmatize(word_hypothesis)
			print(lemma_hypothesis)
			print(lemma_text)
			score += lexical_compare(lemma_text, lemma_hypothesis)
			print(score)
	return score
Example #5
    def negator(self,wordVec):
        negation = False
        negated_doc = []
        lemmatizer = WordNetLemmatizer()
        for w,p in wordVec:
            w_out = ""
            if (p[:2] == "NN"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.NOUN)
            elif (p[:2] == "JJ"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.ADJ)
            elif (p[:2] == "VB"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.VERB)
            elif (p[:2] == "RB"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.ADV)
            if(w_out == "not" or w_out == "n't" ):
                #print "blah"
                negation = not negation
                #rint negation
            elif(w_out in string.punctuation and w_out != ''):

                negation = False
            elif(negation):
                #print negation
                w_out = "NOT_"+w_out
            negated_doc.append((w_out,p))
        #print negated_doc
        return negated_doc
Example #6
def createCorpus(data,i, binaryX="False", stopWords=None, lemmatize="False", tfidf= "False", useidf="True"):  # will vectorize BOG using frequency as the parameter and will return the required arrays
    X_train =[]
    X_test=[]
    Y_train=[]
    Y_test=[]

    for key in data:
        if key in i:

            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port =  WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k,"v") for k in text.split()])
                X_test.append(text)
                Y_test.append(data[key][filename][1])
        else:
            for filename in data[key]:
                text = data[key][filename][0]
                if lemmatize == "True":
                    port =  WordNetLemmatizer()
                    text = " ".join([port.lemmatize(k,"v") for k in text.split()])
                X_train.append(text)
                Y_train.append(data[key][filename][1])
    if tfidf == "False":
        vectorizer = CountVectorizer(min_df=1, binary=(binaryX == "True"), stop_words=stopWords)
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)
        return X_train_ans, Y_train, X_test_ans,Y_test
    elif tfidf == "True":
        vectorizer = TfidfVectorizer(min_df=1, use_idf=(useidf == "True"))
        X_train_ans = vectorizer.fit_transform(X_train)
        X_test_ans = vectorizer.transform(X_test)

        return X_train_ans, Y_train, X_test_ans,Y_test
Example #7
    def getBoW(self, instance):
        bowFeatures = {}

        # tokens in the third position
        tokens = instance[3]
        # pos tag
        wordnet_lemmatizer = WordNetLemmatizer()
        tagged = nltk.pos_tag(tokens)
        i = 0
        for tag in tagged:
            if instance[2] == i:
                i +=1
                continue
                #sys.stderr.write('remove target word (%s)\n' % tag[0])
            elif tag[0] in stopwords.words("english"):
                i +=1
                continue
                #sys.stderr.write('stopword (%s)\n' % tag[0])
            elif re.match("N.*", tag[1]):
                bowFeatures['bow(%s)' %  wordnet_lemmatizer.lemmatize(tag[0], pos="n")] = True
            elif re.match("V.*", tag[1]):
                bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="v")] = True
            elif re.match("R.*", tag[1]):
                bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="r")] = True
            elif re.match("J.*", tag[1]):
                bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="a")] = True
            i += 1
        return bowFeatures
Example #8
class TweetsLemmatizedVectorizer(TweetsTruncatedVectorizer):
  def __init__(self):
    self.vectorizer = TfidfVectorizer(stop_words='english',min_df=5) #, sublinear_tf=True)
    self.wordnet = WordNetLemmatizer()

  def fit_transform(self, users):
    join_tweets = []
    
    for user in users:
      timeline = [''.join(remove_tweet_noise(tweet.text)) for tweet in user.twitter]
      #timeline_insta = [''.join(remove_tweet_noise(insta.text)) for insta in user.instagram]
      #print timeline_insta
      #timeline = timeline + timeline_insta
      lemmatized = []
      for tweet in timeline:
        lemma = [self.wordnet.lemmatize(word) for word in tweet.split()]
        lemmatized.append(' '.join(lemma))
      
      join_tweets.append(' '.join(lemmatized))

    return self.vectorizer.fit_transform([usertweets for usertweets in join_tweets])

  def transform(self, users):
    join_tweets = []
    
    for user in users:
      timeline = [''.join(remove_tweet_noise(tweet.text)) for tweet in user.twitter]
      lemmatized = []
      for tweet in timeline:
        lemma = [self.wordnet.lemmatize(word) for word in tweet.split()]
        lemmatized.append(' '.join(lemma))
      
      join_tweets.append(' '.join(lemmatized))

    return self.vectorizer.transform([usertweets for usertweets in join_tweets])
def possibility():
    wnl = WordNetLemmatizer()
    verb = wnl.lemmatize(verbs[random.randrange(0, len(verbs))])
    noun = wnl.lemmatize(nouns[random.randrange(0, len(nouns))])

    article = "a"
    if noun[0] in ["a", "e", "i", "o", "u"]:
        article = "an"

    if random.randrange(0, 100) < chance_quantity:
        quantity_word = quantity_adverbs[random.randrange(0, len(quantity_adverbs))]
        if not noun.endswith("s") and not noun.endswith("y") and not quantity_word == "numerous":
            noun += "s"
        possibility = verb + " " + quantity_word + " of the " + noun

    elif random.randrange(0, 100) < chance_location:
        location_word = location_adverbs[random.randrange(0, len(location_adverbs))]
        possibility = (
            verb
            + " "
            + article
            + " "
            + noun
            + " "
            + location_word
            + " the "
            + wnl.lemmatize(nouns[random.randrange(0, len(nouns))])
        )

    else:
        possibility = verb + " " + article + " " + noun

    return possibility
Example #10
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        doc = doc.lower()
        doc = re.sub("[^a-z]", " ", doc) #replace punctuation with spaces
        # doc = re.sub("thanks", "thank", doc)
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if len(self.wnl.lemmatize(t)) > 2]
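A sketch of how a tokenizer like this is typically plugged into a scikit-learn vectorizer; the sample sentence is made up and get_feature_names_out assumes scikit-learn >= 1.0:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=LemmaTokenizer())
X = vectorizer.fit_transform(["The cats were chasing the mice in the gardens"])
print(vectorizer.get_feature_names_out())  # lemmas such as 'cat', 'mouse', 'garden'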
 def build_analyzer(self):
     try:
         english_lemmatizer = WordNetLemmatizer()
         analyzer = super(ProcessCountVectorizer, self).build_analyzer()
         return lambda doc: (english_lemmatizer.lemmatize(english_lemmatizer.lemmatize(w, "v"), "n")
                             for w in analyzer(doc) if not w.endswith("ly") and len(w) > 4)
     except Warning:
         pass
Example #12
def pos_analysis(tags, stoplist):
    wordnet_lemmatizer = WordNetLemmatizer()
    nouns = [wordnet_lemmatizer.lemmatize(word) for word, tag in tags if tag=='NN']
    display_freq(nouns, 'Nouns', top=50)
    adjectives = [wordnet_lemmatizer.lemmatize(word) for word, tag in tags if tag=='JJ']
    display_freq(adjectives, 'Adjectives', top=50)
    verbs = [wordnet_lemmatizer.lemmatize(word, pos='v') for word, tag in tags if tag.startswith('VB') and word not in stoplist]
    display_freq(verbs, 'Verbs', top=50)
Example #13
def stopWordRemoval() :


	f = open('repos', 'r')
	strn = f.read()
	lst = strn.split('\n')

	i = 0
	while i < (len(lst) - 1) :
	
		name = lst[i].split("/")

		dummyFile = 'filteredData/' + name[1] + '/dummy.txt';
		dr = os.path.dirname(dummyFile)

		if not os.path.exists(dr) :
			os.makedirs(dr)

		ft = open('data/'+name[1]+'/title.txt')
		st = ft.read().lower()

		fd = open('data/'+name[1]+'/description.txt')
		sd = fd.read().lower()

		fc = open('data/'+name[1]+'/content.txt')
		sc = fc.read().lower()
		

		tokenizer = RegexpTokenizer(r'\w+')

		wordArrTitle = tokenizer.tokenize(st)
		wordArrDesc = tokenizer.tokenize(sd)
		wordArrData = tokenizer.tokenize(sc)

		filteredWordsTitle = [w for w in wordArrTitle if not w in stopwords.words('english')]
		filteredWordsDesc = [w for w in wordArrDesc if not w in stopwords.words('english')]
		filteredWordsData = [w for w in wordArrData if not w in stopwords.words('english')]

		wordnet_lem= WordNetLemmatizer()


		ftf = open('filteredData/'+name[1]+'/title.lst','w')
		for w in filteredWordsTitle:
			#print w
			ftf.write(wordnet_lem.lemmatize(w)+'\n')

		fdf = open('filteredData/'+name[1]+'/description.lst','w')
		for w in filteredWordsDesc:
			#print w
			fdf.write(wordnet_lem.lemmatize(w)+'\n')

		fcf = open('filteredData/'+name[1]+'/content.lst','w')
		for w in filteredWordsData:
			print(w + '\n')
			fcf.write(wordnet_lem.lemmatize(w)+'\n')
		
		i=i+2
Example #14
def tokenize4(text):
	wordnet_lemmatizer = WordNetLemmatizer()
	tokens             = word_tokenize(text)
	wordset            = set(words.words())
	tokens             = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
	tokens             = [token for token in tokens if token in wordset]
	return tokens
 def __init__(self, data, label=None, *args, **kwargs):
     lem = WordNetLemmatizer()
     if data and not label:
         # Data is assumed to be NLTK-style (word, tag) pairs.
         # If you'd like to collapse the tag set, this is the place.
         label = [re.sub(r'[{}]+'.format(punctuation),'PUN',tag) for word, tag in data] # e.g., tag[0]
         data = [re.sub(r'[{}]+'.format(punctuation),'PUN', lem.lemmatize(word.lower())) for word, tag in data]
         data = [re.sub(r'[0-9]+','NUM', lem.lemmatize(word.lower())) for word in data]
     super(TaggedSentence, self).__init__(data, label, *args, **kwargs)
Example #16
	def preprocessing(text):
		lemmatizer = WordNetLemmatizer()
		worddict = set(nltk.corpus.words.words())
		text = text.lower()
		words = text.strip().decode('utf-8')
		wordset_n = set(lemmatizer.lemmatize(w, NOUN) for w in word_tokenize(words))
		wordset_v = set(lemmatizer.lemmatize(w, VERB) for w in wordset_n)
		wordset = set(lemmatizer.lemmatize(w, ADJ) for w in wordset_v)
		wordset = wordset & worddict
		return ' '.join(list(wordset))
Example #17
def lemmatize(word, mode):
    try:
        wnl = WordNetLemmatizer()
        if mode=='n' :
            return wnl.lemmatize(word, 'n')
        elif mode=='v' :
            return wnl.lemmatize(word, 'v')
        return word
    except Exception:
        print("WordNet Lemmatizer failed.")
        return ''
def read_data():
    
    ''' This function reads the data from deals.txt and performs all
    pre-processing. It removes punctuation, stop words and lemmatizes
    the words. Null lines in the file are also removed.'''
    
    good_deals_file = os.path.join(data_dir,"good_deals.txt")
    bad_deals_file = os.path.join(data_dir,"bad_deals.txt")   
    stop_words_file = os.path.join(data_dir,"stop_words.txt")
    test_deals_file = os.path.join(data_dir,"test_deals.txt")
    f_stop_words = open(stop_words_file,'r')
    stop_words = [word.strip() for word in f_stop_words]
    stop_words.append("com")
    stop_words = set(stop_words)
    
    
    wnl = WordNetLemmatizer()
    start_time = time.time() 
    f = open(good_deals_file,'r')
    good_deals = []     
    for line in f:
        deal = re.findall(r"[\w']+|[%!?;]", line)
        # remove stop words
        deal = " ".join(word for word in 
                        deal if word not in stop_words)
        deal =  " ".join((wnl.lemmatize(word)) for word in deal.split())
        good_deals.append(deal)
    f.close()
    
    f = open(bad_deals_file,'r')
    bad_deals = []     
    for line in f:
        deal = re.findall(r"[\w']+|[%!?;]", line)
        # remove stop words
        deal = " ".join(word for word in 
                        deal if word not in stop_words)
        deal =  " ".join((wnl.lemmatize(word)) for word in deal.split())
        bad_deals.append(deal)
        
    f.close()
    
    f = open(test_deals_file,'r')
    test_deals = []     
    for line in f:
        deal = re.findall(r"[\w']+|[%!?;]", line)
        # remove stop words
        deal = " ".join(word for word in 
                        deal if word not in stop_words)
        deal =  " ".join((wnl.lemmatize(word)) for word in deal.split())
        test_deals.append(deal)
    f.close()
    
    
    return [good_deals,bad_deals,test_deals]
    def stem_wordnet(self, word):
        wnl = WordNetLemmatizer()

        # obtain the word class
        tag = nltk.pos_tag(nltk.word_tokenize(word))

        # word class for verb can be different, but the first two letters must be "VB"
        if len(tag[0][1]) >= 2 and (tag[0][1])[0:2] == 'VB':
            return wnl.lemmatize(word, 'v')

        else:
            return wnl.lemmatize(word)
def find_replacements(sentence, lwindow, rwindow, add=False):
    """
    This function would be used to find replacements for the word present
    inside the sentence.

    @sentence: Actual sentence in which word is present.
    @lwindow : Number of context words in the left of the replacement.
    @rwindow : Number of context words in the right of the replacement.
    @add     : Whether we are going to add the vectors. 
               Otherwise default to multiply.

    """
    # Remove the START and END temporarily and tag the data.
    word       = sentence[sentence.index('_START_') + 7 : sentence.index('_END_')]
    word_index = nltk.word_tokenize(sentence).index("_START_" + word + "_END_")
    t_sentence = sentence[:sentence.index('_START_')] + word + sentence[sentence.index('_END_') + 5:]

    # Tag the sentence and then bring the START and END back.
    tagged_sentence = nltk.pos_tag(nltk.word_tokenize(t_sentence))
    #print sentence, tagged_sentence

    wnl = WordNetLemmatizer()
    word_postag = get_wordnet_pos(tagged_sentence[word_index][1])
    if word_postag:
        word = wnl.lemmatize(word, pos=word_postag)
    tagged_sentence[word_index] = ["_START_" + word + "_END_", tagged_sentence[word_index][1]]
    
    # Remove all the words, whose tags are not important and also
    # get rid of smaller words.
    imp_words = filter(lambda x: len(x[0]) > 2, get_imp_words(tagged_sentence))
    #print imp_words

    final_list = []
    for i, x in enumerate(imp_words):
        if x[0].startswith("_START_"):
            index = i
            x[0] = x[0][7:x[0].index("_END_")]
            final_list.append("_START_" + x[0].lower() + "_" + x[1][0].lower() + "_END_")
            word = word.lower() #+ "_" + x[1][0].lower()
            #print word
        else:
            # Lemmatize all the words.
            word_postag = get_wordnet_pos(x[1])
            temp = x[0]
            if word_postag:
                temp = wnl.lemmatize(x[0], pos=word_postag)
            final_list.append(temp.lower()) # + "_" + x[1][0].lower())

    try:
        return find_replacements_helper(final_list, word, index, int(lwindow), int(rwindow) + 1, add)
    except Exception:
        return "NONE"
Example #21
def events_filter(title, lang):
    #Cleans, tokenizes and lemmatizes a news title to keep only keywords.
    #This way, words are saved in their dictionary form.
    #With this we have a standard way of representing an event.

    f_tags = []

    #Regex adapted from nltk documentation
    pattern = (
        r"(?x)"      # set flag to allow verbose regexps
        r"(?:[A-Z])(?:\.[A-Z])+\.?"  # abbreviations, e.g. U.S.A.
        r"|\w+(?:-\w+)*"            # words with optional internal hyphens
        r"|\$?\d+(?:\.\d+)?%?"      # currency and percentages, e.g. $12.40, 82%
        )

    #Tokenize title according to the regex pattern.
    tokens = nltk.regexp_tokenize(title, pattern)

    #Remove stopwords. Lang should be either 'english' or 'spanish'.
    tokens = [w.lower() for w in tokens if w.lower() not in sw.words(lang)]

    if lang == "english":
        #Lemmatization for english.
        wnl = WordNetLemmatizer()

        #Tag words (noun, adjective, verb or adverb). Makes lemmatization more accurate.
        pos_toks = nltk.pos_tag(tokens)

        #Translate the pos_tag output into tags that lemmatize understands.
        wordnet_tag = {
            'NN':'n', 'NNS':'n', 'NNP':'n',
            'NNPS':'n', 'JJ':'a', 'JJR':'a',
            'JJS':'a', 'VB':'v', 'VBD':'v',
            'VBG':'v', 'VBN':'v', 'VBP':'v',
            'VBZ':'v', 'RB':'r', 'RBR':'r', 'RBS':'r'}

        #Lemmatization, with pos tags.
        for i in range(len(tokens)):
            pos_tok = pos_toks[i]
            if pos_tok[1] in wordnet_tag.keys():
                tokens[i] = wnl.lemmatize(tokens[i], wordnet_tag[pos_tok[1]])
            else:
                tokens[i] = wnl.lemmatize(tokens[i])
    elif lang == "spanish":
        #Lemmatization for spanish, using a dictionary.
        for i in range(len(tokens)):
            if tokens[i] in SP_LEMMAS.keys():
                tokens[i] = SP_LEMMAS[tokens[i]]
            #else: word not in dictionary, save token unchanged.
    for tok in tokens:
        f_tags.append(clean_word(tok))
    return f_tags
Example #22
 def __wn_lemmatize(self, lemma):
     """
     Lemmatize lemma using wordnet.stemWordNetLemmatizer(). Always
     returns a (string, pos) pair.  Lemmatizes even when the tag
     isn't helpful, by ignoring it for stemming.
     """
     string, tag = lemma
     wnl = WordNetLemmatizer()
     if tag in ('a', 'n', 'r', 'v'):
         string = wnl.lemmatize(string, tag)
     else:
         string = wnl.lemmatize(string)
     return (string, tag)
    def __lemmatizeTuples(self, tuples):
        lmt = WordNetLemmatizer();

        # Multiple tuples
        if isinstance(tuples, list):
            lemmaTuples = [(' '.join([lmt.lemmatize(c.lower(), 'v') for c in j[0].split(' ')]), \
                            ' '.join([lmt.lemmatize(c.lower(), 'n') for c in j[1].split(' ')]), \
                            ' '.join([lmt.lemmatize(c.lower(), 'n') for c in j[2].split(' ')])) \
                            for j in tuples];
        else:
            lemmaTuples = (' '.join([lmt.lemmatize(c.lower(), 'v') for c in tuples[0].split(' ')]), \
                            ' '.join([lmt.lemmatize(c.lower(), 'n') for c in tuples[1].split(' ')]), \
                            ' '.join([lmt.lemmatize(c.lower(), 'n') for c in tuples[2].split(' ')]));
        return lemmaTuples
Example #24
def dogify(inp):
  wnl = WordNetLemmatizer()
  l = nltk.word_tokenize(inp)
  l1, l2, l3 = [], [], []

  for i, j, in nltk.pos_tag(l):
    if len(i) < 4: continue
    if j == "NN":
      l1.append(i)
    elif j == "JJ":
      l2.append(i)
    elif j.find("VB") != -1:
      l3.append(i)

  def rnd(x):
    return random.randint(0, x-1)

  def go(l):
    l.sort()
    ret = [""]
    cnt = 0
    bst = 0
    prv = ""
    for i in l:
      if i != prv: 
        cnt = 0
      cnt += 1
      if cnt > bst:
        bst = cnt
        ret = []
      if cnt == bst:
        ret.append(i)
      prv = i
    x = rnd(len(ret))
    return ret[x]

  noun = wnl.lemmatize(go(l1))
  adj = wnl.lemmatize(go(l2))
  verb = wnl.lemmatize(go(l3))

  s = ""
  if len(noun):
    s += "so " + noun + "\n"
  if len(adj):
    s += "much " + adj + "\n"
  if len(verb):
    s +=  "very " + verb + "\n"
  s += "wow"
  return s
 def lemmatize_text_as_list(self, text_alpha_no_punct_stopword_list):
     lemmatizer = WordNetLemmatizer()
     lemmatized_list_by_verb = []
     lemmatized_list_by_verb_noun = []
     lemmatized_list_by_verb_noun_adj = []
     lemmatized_list_by_verb_noun_adj_adv = []
     for i in text_alpha_no_punct_stopword_list:
         lemmatized_list_by_verb.append(lemmatizer.lemmatize(i, pos='v'))
     for i in lemmatized_list_by_verb:
         lemmatized_list_by_verb_noun.append(lemmatizer.lemmatize(i, pos='n'))
     for i in lemmatized_list_by_verb_noun:
         lemmatized_list_by_verb_noun_adj.append(lemmatizer.lemmatize(i, pos='a'))
     for i in lemmatized_list_by_verb_noun_adj:
         lemmatized_list_by_verb_noun_adj_adv.append(lemmatizer.lemmatize(i, pos='r'))
     return lemmatized_list_by_verb_noun_adj_adv
Example #26
def tokenize5(text):
	wordnet_lemmatizer = WordNetLemmatizer()
	translate_table = dict((ord(char), None) for char in string.punctuation)
	if type(text) == str:
		tokens = word_tokenize(text.translate(None, string.punctuation)) # remove punctuation
		tokens = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
		tokens = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
		tokens = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
		return tokens
	elif type(text) == unicode:
		tokens = word_tokenize(text.translate(translate_table))
		tokens = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
		tokens = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
		tokens = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
		return tokens
def generate_captions_and_comments():
	
	with open('./data/big_data_approx.json') as json_file:   
		video_data = json.load(json_file)
		
	video_num_comments, video_captions = np.array([ (video_datum["score"], video_datum["captions"]) 
                                              for _,video_datum in video_data.items() ]).T
												
	# Define a stemmer and lemmatizer for use with our captions
	stemmer = PorterStemmer()
	lemmatizer = WordNetLemmatizer()


	combined_video_captions = []
	video_num_comments_cut  = []
	for caption_data_list,num_comments in zip(video_captions,video_num_comments):
		text = ""
		if caption_data_list is not None:
			video_num_comments_cut.append(num_comments)
			for caption_data in caption_data_list:
				if caption_data is not None and "text" in caption_data:
					for word in caption_data["text"].split():
						#text += (stemmer.stem(word)+" ")
						text += (lemmatizer.lemmatize(word)+" ")
			combined_video_captions.append(text[:-1])
		
	video_captions = combined_video_captions
	
	return (video_num_comments_cut, video_captions)
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer",
        "LancasterStemmer", "WordNetLemmatizer"]
    if type is False or type not in supported_stemmers:
        return words_l
    else:
        l = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "WordNetLemmatizer":  # TODO: context
            wnl = WordNetLemmatizer()
            for word in words_l:
                l.append(wnl.lemmatize(word).encode(encoding))
        return l
class Mapper(object):

    def __init__(self):
        if 'stopwords' in self.params:
            with open(self.params['stopwords'], 'r') as excludes:
                self._stopwords = set(line.strip() for line in excludes)
        else:
            self._stopwords = None

        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, key, value):
        for word in self.tokenize(value):
            if not word in self.stopwords:
                yield word, 1

    def normalize(self, word):
        word = word.lower()
        return self.lemmatizer.lemmatize(word)

    def tokenize(self, sentence):
        for word in wordpunct_tokenize(sentence):
            yield self.normalize(word)

    @property
    def stopwords(self):
        if not self._stopwords:
            self._stopwords = nltk.corpus.stopwords.words('english')
        return self._stopwords
def search_posts(phrase, engine):
    lemmatizer = WordNetLemmatizer()
    words = ["(^|[^a-z])" + lemmatizer.lemmatize(word)
                for word in word_tokenize(phrase)
                    if word not in stopwords.words('english')
                    and len(word) >= 3]

    if len(words) == 0:
        return None

    params = {'phrase': "|".join(words)}
    query = ["SELECT link_id, url, title FROM threads", 
             "WHERE title_lower ~ %(phrase)s"]
    found = pd.read_sql(" ".join(query), 
                       engine, 
                       params=params)
    
    if len(found['link_id']) == 0: 
        return None 

    link_ids = ', '.join(found['link_id'].apply(lambda lid: "'" + lid + "'"))
    query = ["SELECT clean_body as body, affil, link_id FROM cleaned", 
             "WHERE link_id IN (" + link_ids + ")"]
    data = pd.read_sql(" ".join(query), engine)
    
    valid = data[data['body'].apply(lambda text: len(text.split()) >= 10 
                                 and not bool(re.search("[^a-z]bot[^a-z]", text)))]
    
    if valid.shape[0] < 60: 
        return None
    
    return valid, found.set_index('link_id')
Example #31
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
# filter punctuation into a new list; removing items from a list while
# iterating over it would skip elements
sentence_words = [word for word in sentence_words if word not in punctuations]

sentence_words
print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    # print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))
    '''
    In the output above you may notice that no proper root form is returned for
    some words. This is because they are lemmatized without context: you need to
    provide the part-of-speech (POS) of the word via the pos parameter of
    wordnet_lemmatizer.lemmatize.
    '''
    print("{0:20}{1:20}".format(word,
                                wordnet_lemmatizer.lemmatize(word, pos='v')))
print(wordnet_lemmatizer.lemmatize("was", pos='v'))
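A small sketch of how that context is usually supplied automatically: map each Penn Treebank tag from nltk.pos_tag to a WordNet POS before lemmatizing (penn_to_wordnet is a hypothetical helper; punkt, averaged_perceptron_tagger and wordnet data must be downloaded):

from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def penn_to_wordnet(tag):
    # fall back to noun when the tag gives no useful information
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
tokens = word_tokenize("He was running and eating at the same time.")
print([lemmatizer.lemmatize(w, penn_to_wordnet(t)) for w, t in pos_tag(tokens)])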
class Preprocess:

    def __init__(self, text):

        self.text = text
        self.STOPWORDS = set(stopwords.words('english'))
        self.spell = SpellChecker()
        self.p = inflect.engine()
        self.nlp = en_core_web_sm.load()
        #self.nlp = spacy.load('en_core_web_md')
        self.model = api.load("glove-twitter-25")
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    
    def strip_html_tags(self):

        """remove html tags from text"""
        soup = BeautifulSoup(self.text, "html.parser")
        stripped_text = soup.get_text(separator=" ")
        return stripped_text
    

    def remove_accented_chars(self):

        """remove accented characters from text, e.g. café"""
        text = unidecode.unidecode(self.text)
        return text

    
    '''def expand_contractions(self, text):
        """expand shortened words, e.g. don't to do not"""
        text = list(cont.expand_texts([text], precise=True))[0]
        return text'''
    

    def pos_tagging(self): 

        word_tokens = word_tokenize(self.text) 
        return pos_tag(word_tokens)

    
    def text_lowercase(self): 

        return self.text.lower()


    def text_uppercase(self): 

        return self.text.upper()

    
    def remove_numbers(self): 

        result = re.sub(r'\d+', '', self.text) 
        return result

    
    def convert_number(self): 

        # split string into list of words 
        temp_str = self.text.split() 
        # initialise empty list 
        new_string = [] 
    
        for word in temp_str: 
            # if word is a digit, convert the digit 
            # to numbers and append into the new_string list 
            if word.isdigit(): 
                temp = self.p.number_to_words(word) 
                new_string.append(temp) 
    
            # append the word as it is 
            else: 
                new_string.append(word) 
    
        # join the words of new_string to form a string 
        temp_str = ' '.join(new_string) 
        return temp_str
    

    def remove_punctuation(self): 

        translator = str.maketrans('', '', string.punctuation) 
        return self.text.translate(translator)

    
    def remove_whitespace(self): 

        return  " ".join(self.text.split()) 

    
    def remove_stopwords(self):

        """custom function to remove the stopwords"""
        return " ".join([word for word in str(self.text).split() if word not in self.STOPWORDS])

    
    def stem_words(self):

        return " ".join([self.stemmer.stem(word) for word in self.text.split()])


    def lemmatize_words(self):

        return " ".join([self.lemmatizer.lemmatize(word) for word in self.text.split()])

    
    def remove_freqwords(self, df, column_name):

        """custom function to remove the frequent words"""

        cnt = Counter()

        for text in df[column_name].values:
            for word in text.split():
                cnt[word] += 1
        FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])

        return " ".join([word for word in str(self.text).split() if word not in FREQWORDS])

    
    def remove_emoji(self):

        emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', self.text)

    
    def remove_emoticons(self):

        emoticon_pattern = re.compile(u'(' + u'|'.join(re.escape(k) for k in EMOTICONS) + u')')
        return emoticon_pattern.sub(r'', self.text)

    
    def convert_emoticons(self):

        text = self.text
        for emot in EMOTICONS:
            text = re.sub(u'(' + re.escape(emot) + ')', "_".join(EMOTICONS[emot].replace(",", "").split()), text)
        return text


    def remove_urls(self):

        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', self.text)

    
    def remove_html(self):

        html_pattern = re.compile('<.*?>')
        return html_pattern.sub(r'', self.text)

    
    def correct_spellings(self):

        corrected_text = []
        misspelled_words = self.spell.unknown(self.text.split())

        for word in self.text.split():
            if word in misspelled_words:
                corrected_text.append(self.spell.correction(word))
            else:
                corrected_text.append(word)

        return " ".join(corrected_text)


    def NER(self):

        doc = self.nlp(self.text)
        entity_label_map = dict()

        for entity in doc.ents:
            entity_label_map[entity.text] = entity.label_
        
        return entity_label_map
def get_lemmatized_text(corpus):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]
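A quick usage sketch with a made-up two-review corpus (assumes the NLTK wordnet data is available):

sample_corpus = ["the dogs were barking loudly", "these movies are better than the books"]
print(get_lemmatized_text(sample_corpus))
# plural nouns such as 'dogs' -> 'dog' and 'movies' -> 'movie' are reduced to their lemma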
Example #34
from nltk.stem import WordNetLemmatizer
'''
    Lemmatizing - better than stemming, as it returns an actual word with meaning
    and can also map many related word forms onto a single word.
'''
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cats'))
print(lemmatizer.lemmatize('better'))
print(lemmatizer.lemmatize('best', pos='a'))
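A brief side-by-side sketch of the difference between stemming and lemmatizing (the word list is arbitrary):

from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
for word in ['studies', 'studying', 'corpora', 'feet']:
    print(word, '->', stemmer.stem(word), '|', lemmatizer.lemmatize(word))
# the stemmer just chops suffixes (e.g. 'studies' -> 'studi'), while the
# lemmatizer maps to dictionary forms (e.g. 'corpora' -> 'corpus', 'feet' -> 'foot')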
lemma = WordNetLemmatizer()
news = pd.read_csv(r"news.csv")
data = news.drop(['Unnamed: 0'], axis=1)
TEXTdata = []
TITLEdata = []
for i in range(len(news)):
    data['text'].iloc[i] = re.sub('[^a-zA-Z]', ' ', data['text'].iloc[i]).lower()
    data['title'].iloc[i] = re.sub('[^a-zA-Z]', ' ', data['title'].iloc[i]).lower()

    textword = word_tokenize(data['text'].iloc[i])
    titleword = word_tokenize(data['title'].iloc[i])
    text = ""
    title = ""
    for w in textword:
        if w not in stop_words:
            wr = lemma.lemmatize(w)
            text = text + " " + wr
    for k in titleword:
        if k not in stop_words:
            kr = lemma.lemmatize(k)
            title = title + " " + kr
    TEXTdata.append(text)
    TITLEdata.append(title)

#Vectorisation of data to produce training data and labels
Y = []
for i in range(len(data)):
    if data['label'].iloc[i] == 'FAKE':
        Y.append(1)
    elif data['label'].iloc[i] == 'REAL':
        Y.append(0)
Example #36
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import pickle
import enchant
from torch.autograd import Variable

from string import digits

#### Please download nltk resources before using this file. 
nltk.download("stopwords")
print(stopwords.words('english'))
nltk.download('wordnet')
wordnet_lematizer = WordNetLemmatizer()
print(wordnet_lematizer.lemmatize('good'))
nltk.download('averaged_perceptron_tagger')
print(nltk.pos_tag(['do','yes']))

class Myarticles(data.Dataset):
    def __init__(self, csvfile_path, txt_folder_path, glove_path='/Users/duanyiqun/Downloads/Textcls/glove.6B', validation=False):
        self.glove_path =glove_path
        #self.glove_init()
        #self.w2v = self.init_word2vec()
        self.articleor = self.Creat_article_list(csvfile_path)
        if validation:
            self.articles = self.articleor[200:250]
        else:
            self.articles = self.articleor[0:200]
        #self.articles = self.articleor[0:10]
        self.folderpath = txt_folder_path
text = [i.lower() for i in text]  # To convert into lower case
stop = set(stopwords.words('english'))
text = [word_tokenize(i) for i in text]
tweets = []
text1 = []
for i in text:
    i = [w for w in i if not w in stop]  # To remove stop words
    i = [w for w in i
         if not re.search(r'^-?[0-9]+(.[0-9]+)?$', w)]  # To remove numbers
    text1.append(i)
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_token = []
for sent in text1:  # Lemmatization to convert tokens to canonical form
    tweets = []
    for token in sent:
        token = wordnet_lemmatizer.lemmatize(token)
        token = wordnet_lemmatizer.lemmatize(token, pos='v')
        tweets.append(token)
    lemmatized_token.append(tweets)

##########################################################################
# COMMENTED INTENTIONALLY                                                #
# Code to find top 100 words as per frequency to check for unwanted and  #
# redundant words. Added stop words based on this analysis.              #
##########################################################################
#from collections import Counter
#count=[]
#for i in lemmatized_token:
#    for j in i:
#        count.append(j)
#count
Example #38
            outputFile = outputFile.replace('\n', ' ')

            outputFile = regex.sub("'", "", outputFile)
            outputFile = regex.split('\W+', outputFile)

            training_data_lst.append({'doc': outputFile, 'class': class_name})
            classes_quan[training_data_lst[len(training_data_lst) - 1]
                         ['class']] = classes_quan[training_data_lst[
                             len(training_data_lst) - 1]['class']] + 1

            for j in range(
                    0,
                    len(training_data_lst[len(training_data_lst) - 1]['doc'])):
                training_data_lst[len(training_data_lst) -
                                  1]['doc'][j] = lemmatizer.lemmatize(
                                      training_data_lst[len(training_data_lst)
                                                        - 1]['doc'][j].lower())
                if (not (training_data_lst[len(training_data_lst) -
                                           1]['doc'][j] in stopWord)
                    ) and (not (
                        training_data_lst[len(training_data_lst) - 1]['doc'][j]
                        in tokens_document)) and (
                            len(training_data_lst[len(training_data_lst) -
                                                  1]['doc'][j]) > 1
                        ):  # keep tokens that are not stopwords, not already in the distinct token list, and longer than one character
                    tokens_document.append(
                        training_data_lst[len(training_data_lst) -
                                          1]['doc'][j])

            doc = list(
                set(training_data_lst[len(training_data_lst) - 1]['doc']))
    
from nltk.stem import WordNetLemmatizer

tester = 1
lemmatizer = WordNetLemmatizer()
documents = df_refined

# Tokenize words
from nltk.tokenize import word_tokenize
from nltk import download
download('punkt')

documents_tokenized = [word_tokenize(document) for document in documents]

# lemmatizing tokens (better than stemming by taking word context into account)
documents_tokenized_lemmatized = [[lemmatizer.lemmatize(token) for token in text] 
                                                    for text in documents_tokenized]

from nltk.sentiment.util import mark_negation

documents_tokenized_lemmatized_negated = [mark_negation(document) for document in documents_tokenized_lemmatized]

ready_corpus=documents_tokenized_lemmatized_negated

download('opinion_lexicon')
from nltk.corpus import opinion_lexicon

# we consider only sentiment words; opinion_lexicon already includes misspelled sentiment words,
# so we did not use the enchant library this time.
sentiment_words= opinion_lexicon.words()
sentiment_words_negated= [word+'_NEG' for word in sentiment_words]
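A tiny sketch of what mark_negation does to a tokenized sentence (the sentence is made up):

from nltk.sentiment.util import mark_negation

print(mark_negation("the plot was not good at all .".split()))
# tokens after the negation get a _NEG suffix until the next clause punctuation,
# e.g. ['the', 'plot', 'was', 'not', 'good_NEG', 'at_NEG', 'all_NEG', '.']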
Example #40
# In[4]:

#removing stop words from the list of words
no_stop_words1 = [word for word in words1 if word not in stop_words]
no_stop_words2 = [word for word in words2 if word not in stop_words]
no_stop_words3 = [word for word in words3 if word not in stop_words]
no_stop_words4 = [word for word in words4 if word not in stop_words]

# In[5]:

#Chose to lemmatize, as lemmatizing does not merely cut off the ending but reduces the word to its base form
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

lemmatized_words1 = [lemmatizer.lemmatize(x) for x in no_stop_words1]
lemmatized_words2 = [lemmatizer.lemmatize(x) for x in no_stop_words2]
lemmatized_words3 = [lemmatizer.lemmatize(x) for x in no_stop_words3]
lemmatized_words4 = [lemmatizer.lemmatize(x) for x in no_stop_words4]

# In[6]:

from nltk.stem.porter import *
stemmer = PorterStemmer()

stemmed_words1 = [stemmer.stem(x) for x in lemmatized_words1]
stemmed_words2 = [stemmer.stem(x) for x in lemmatized_words2]
stemmed_words3 = [stemmer.stem(x) for x in lemmatized_words3]
stemmed_words4 = [stemmer.stem(x) for x in lemmatized_words4]

# In[7]:
print(POS_tag)

# nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

adjective_tags = ['JJ', 'JJR', 'JJS']

lemmatized_text = []

for word in POS_tag:
    if word[1] in adjective_tags:
        lemmatized_text.append(
            str(wordnet_lemmatizer.lemmatize(word[0], pos="a")))
    else:
        lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(
            word[0])))  # default POS = noun

print("Text tokens after lemmatization of adjectives and nouns: \n")
print(lemmatized_text)

POS_tag = nltk.pos_tag(lemmatized_text)

print("Lemmatized text with POS tags: \n")
print(POS_tag)

stopwords = []

wanted_POS = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'VBG', 'FW']
Example #42
def tokenize(text):
    '''process the text into cleaned tokens

    The text is processed by removing links, emails and IPs,
    keeping only alphabetic characters a-z in lower case; the
    text is then split into individual tokens, stop words are removed,
    and words are lemmatized to their original stem.

    Args:
      text (str): a message in text form

    Returns:
      clean_tokens (array): array of words after processing
    '''

    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    emails_regex = '[a-zA-Z0-9+_\-\.]+@[0-9a-zA-Z][.-0-9a-zA-Z]*.[a-zA-Z]+'
    ips_regex = '(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})'
    stopword_list = stopwords.words('english')
    placeholder_list = ['urlplaceholder', 'emailplaceholder', 'ipplaceholder']

    # Remove extra paranthesis for better URL detection
    text = text.replace("(", "")
    text = text.replace(")", "")

    # get list of all urls/emails/ips using regex
    detected_urls = re.findall(url_regex, text)
    detected_emails = re.findall(emails_regex, text)
    # remove white space detected at the end of some emails
    detected_emails = [email.split()[0] for email in detected_emails]
    detected_ips = re.findall(ips_regex, text)

    # Remove numbers and special characters, help down vocab size
    pattern = re.compile(r'[^a-zA-Z]')
    stopword_list = stopwords.words('english')

    for url in detected_urls:
        text = re.sub(url, 'urlplaceholder', text)
    for email in detected_emails:
        text = re.sub(email, 'emailplaceholder', text)
    for ip in detected_ips:
        text = re.sub(ip, 'ipplaceholder', text)
    for stop_word in stopword_list:
        if (stop_word in text):
            text.replace(stop_word, '')

    # remove everything except letters
    text = re.sub(pattern, ' ', text)
    # initialize tokenizer and lemmatizer
    tokens = word_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        if ((tok not in stopword_list) and (tok not in placeholder_list)
                and len(tok) > 2):
            clean_tok = lemmatizer.lemmatize(lemmatizer.lemmatize(tok.strip()),
                                             pos='v')
            # Remove Stemmer for better word recognition in app
            #clean_tok = PorterStemmer().stem(clean_tok)
            clean_tokens.append(clean_tok)

    return clean_tokens
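A possible quick check of the tokenizer above; the message is invented, and it assumes re, stopwords, word_tokenize and WordNetLemmatizer are imported as in the original module:

print(tokenize("Please visit https://example.com or mail help@example.com about the flooding!"))
# the url/email placeholders, stop words and short tokens are filtered out,
# and the remaining words are lemmatized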
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
stemmer_output = PorterStemmer()
print(stemmer_output.stem('happiness'))
lemmatizer_output = WordNetLemmatizer()
print(lemmatizer_output.lemmatize('happiness'))
Example #44
# replace internal whitespace with underscores,
# remove all non-alphabet characters (numbers, punctuation)
cleaned_tags = [
    re.sub(r"\s+", '_',
           re.sub(r"[^a-zA-Z\s]+", '', t.lower()).strip()) for t in tags
]
tag_df['Cleaned_Tag'] = cleaned_tags
print(len(set(cleaned_tags)))
# 9151 post-cleaning

# # Stemming -- computationally quicker, but lemma
# # is preferable due to higher level of sophistication.
# porter_stemmer = PorterStemmer()

# stemmed_tags = [porter_stemmer.stem(t)
#                 for t in cleaned_tags]
# tag_df['Stemmed_Tag'] = stemmed_tags
# print len(set(stemmed_tags))
# # 7517 post-stemming

# Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
lemma_tags = [wordnet_lemmatizer.lemmatize(t) for t in cleaned_tags]
tag_df['Lemmatized_Tag'] = lemma_tags
print(len(set(lemma_tags)))
# 8455 unique lemmatized_tags
print(len(set([t for t in lemma_tags if '_' in t])))
# 1181 of which are multi_word

with open('data/cleaned_tags.pickle', 'wb') as f:
    pickle.dump(tag_df, f)
Example #45
def lemma(word, part):
    n = WordNetLemmatizer()

    return n.lemmatize(word, part)
Example #46
df = pd.read_csv(file)
ratings = df.Rating
headers = df.Header
reviews = df.Review
products = df.Product

new_headers = list()
new_reviews = list()

print("Checking headers")
for header in headers:
    if type(header) != float:
        new_header = list()
        words = tokenizer.tokenize(header)
        for word in words:
            lemma = lmtzr.lemmatize(word.lower())
            new_header.append(lemma)
        new_headers.append(' '.join(new_header))
    else:
        new_headers.append('')

print("Checking reviews")
for review in reviews:
    if type(review) != float:
        new_review = list()
        words = tokenizer.tokenize(review)
        for word in words:
            lemma = lmtzr.lemmatize(word.lower())
            new_review.append(lemma)
        new_reviews.append(' '.join(new_review))
    else:
Example #47
# stemming - extract word stems
# import the stem.porter and Lancaster toolkits
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
# instantiate a PorterStemmer object
porter_stemmer = PorterStemmer()
# instantiate a LancasterStemmer object
lancaster_stemmer = LancasterStemmer()
# create stemmed_list and lancaster_list to hold the PorterStemmer and LancasterStemmer results
stemmed_list = []
lancaster_list = []
for token in tokens:
    stemmed_list.append(porter_stemmer.stem(token))
    lancaster_list.append(lancaster_stemmer.stem(token))
print("Stemming results:")
print("1.PorterStemmer:", stemmed_list)
print("2.LancasterStemmer:", lancaster_list)

# Lemmatization - restore words to their dictionary form
# NLTK's lemmatization is implemented on top of WordNet; import WordNetLemmatizer.
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
# create lem_list to hold the lemmatization results
lem_list = []
for token in tokens:
    lem_list.append(wordnet_lemmatizer.lemmatize(token))
print("Lemmatization results:")
print(lem_list)
Example #48
 def Lemmatize_words(self,words):
     lemmatizer = WordNetLemmatizer()
     l_words = [lemmatizer.lemmatize(w) for w in words]
     return " ".join(l_words)
        t = line.split()
        for i in range(len(t)):
            if len(t) > 2 and (t[i].startswith("(NN") or t[i].startswith("NN")
                               ) and not t[i].startswith("(NNP"):
                # get noun
                noun = t[i + 1].strip("))").lower()
                sentence_nouns.append(noun)
                if noun not in d:
                    d[noun] = {}

        for j in range(len(t)):
            # verb stuff here
            if len(t) > 2 and (t[j].startswith("(VB")):
                verb = t[j + 1].strip(")").lower()
                verb = wnl.lemmatize(verb, 'v')

                for n in sentence_nouns:
                    counts = d[n]
                    if verb in d[n]:
                        counts[verb] = counts[verb] + 1
                    else:
                        counts[verb] = 1
                    d[n] = counts

    print("finished " + filename + "!")

for noun in d:
    temp = []
    for key, value in sorted(d[noun].items(), key=itemgetter(1), reverse=True):
        temp.append((key, value))
def lem_abstract_pd(df):
    word_lem = WordNetLemmatizer()
    df['abstract_cleaned'] = df['abstract_cleaned'].apply(
        lambda x: [word_lem.lemmatize(y) for y in x])
    return df
    try:
        save_json(line_words, config.PATH_WORDS)
    except:
        os.remove(config.PATH_WORDS)
        exit(1)

# 4. Lemmatization using NLTK tool
if os.path.exists(config.PATH_LEM_WORDS):
    lemma_line_words = load_json(config.PATH_LEM_WORDS)
else:
    lemmatizer = WordNetLemmatizer()
    # first make a copy
    lemma_line_words = line_words.copy()
    for line_id, line_word in enumerate(line_words):
        for word_id, word in enumerate(line_word):
            lemma_line_words[line_id][word_id] = lemmatizer.lemmatize(word)
    try:
        save_json(lemma_line_words, config.PATH_LEM_WORDS)
    except:
        os.remove(config.PATH_LEM_WORDS)
        exit(1)

# 5. remove stopword using spacy
nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words

if os.path.exists(config.PATH_NO_STOP):
    no_stop_line_words = load_json(config.PATH_NO_STOP)
else:
    no_stop_line_words = []
    for line_id, line_word in enumerate(lemma_line_words):
Example #52
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
for intent in intents['intents']:
    for pattern in intent['patterns']:

        #tokenize each word
        w = nltk.word_tokenize(pattern)
        words.extend(w)
        #add documents in the corpus
        documents.append((w, intent['tag']))

        # add to our classes list
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

# lemmatize and lower each word and remove duplicates
words = [
    lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words
]
words = sorted(list(set(words)))
# sort classes
classes = sorted(list(set(classes)))
# documents = combination between patterns and intents
print(len(documents), "documents")
# classes = intents
print(len(classes), "classes", classes)
# words = all words, vocabulary
print(len(words), "unique lemmatized words", words)

pickle.dump(words, open('words.pkl', 'wb'))
pickle.dump(classes, open('classes.pkl', 'wb'))

# create our training data
Example #54
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def penn_to_wn(tag):
    return get_wordnet_pos(tag)


for i in range(len(corpus), len(dataset)):
    review = dataset['text'][i]
    review = [
        lemmatizer.lemmatize(word, pos=penn_to_wn(nltk.pos_tag([word])[0][1]))
        for word in word_tokenize(review) if word not in string.punctuation
    ]
    review = ' '.join(review)
    corpus.append(review)

gc.collect()
#print(sys.getsizeof(corpus))
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(decode_error='ignore',
                        stop_words='english',
                        lowercase=True,
                        binary=False,
                        analyzer='word',
                        token_pattern='[A-Za-z]{3,}',
                        ngram_range=(1, 1),
Example #55
    ''.join(c for c in s if c not in punctuation) for s in word_tokens
]
# remove empty strings
word_tokens = [s for s in word_tokens if s]

# Removing stop words — frequent words such as ”the”, ”is”, etc. that do not have specific semantic to further cleanup the text corpus.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in word_tokens if not w in stop_words]

# Lemmatisation unlike Stemming, reduces the inflected words properly ensuring that the root word belongs to the language.
from nltk.stem import WordNetLemmatizer
# init the wordnet lemmatizer
lmtzr = WordNetLemmatizer()
lemm_tokens = [lmtzr.lemmatize(x) for x in filtered_tokens]

import nltk

bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(
    word_tokens)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(
    word_tokens)

import pandas as pd

#bigrams
bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq),
        import string
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]

        # remove remaining tokens that are not alphabetic
        words = [word for word in stripped if word.isalpha()]

        # filter out stop words
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if not w in stop_words]

        #lemmatization
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        CleanedText = ' '.join(words)

        from nltk.sentiment.vader import SentimentIntensityAnalyzer
        sid = SentimentIntensityAnalyzer()
        possentiment = sid.polarity_scores(CleanedText)['pos']
        negsentiment = sid.polarity_scores(CleanedText)['neg']
        comsentiment = sid.polarity_scores(CleanedText)['compound']
        possentiments.append(possentiment)
        negsentiments.append(negsentiment)
        comsentiments.append(comsentiment)

# In[9]:

details = zip(dates, media, possentiments, negsentiments, comsentiments)
Example #57
data_cleaned = []
for doc in groups.data:
    doc_cleaned = ' '.join(word for word in doc.split() if word.isalpha())
    data_cleaned.append(doc_cleaned)

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(ENGLISH_STOP_WORDS)

from nltk.corpus import names
all_names = set(names.words())

count_vector_sw = CountVectorizer(stop_words="english", max_features=500)

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

data_cleaned = []

for doc in groups.data:
    doc = doc.lower()
    doc_cleaned = ' '.join(
        lemmatizer.lemmatize(word) for word in doc.split()
        if word.isalpha() and word not in all_names)
    data_cleaned.append(doc_cleaned)

data_cleaned_count = count_vector_sw.fit_transform(data_cleaned)

print(count_vector_sw.get_feature_names_out())

# In[ ]:
Example #58
class Myarticles(data.Dataset):
    def __init__(self, csvfile_path, txt_folder_path, glove_path='/Users/duanyiqun/Downloads/Textcls/glove.6B', validation=False):
        self.glove_path =glove_path
        #self.glove_init()
        #self.w2v = self.init_word2vec()
        self.articleor = self.Creat_article_list(csvfile_path)
        if validation:
            self.articles = self.articleor[200:250]
        else:
            self.articles = self.articleor[0:200]
        #self.articles = self.articleor[0:10]
        self.folderpath = txt_folder_path
        self.snowball_stemmer = SnowballStemmer('english')
        self.wordnet_lematizer = WordNetLemmatizer()
        self.delset = str.maketrans('', '', string.punctuation)
        self.remove_digits = str.maketrans('', '', digits)
        self.spelldict = enchant.Dict("en_US")
        self.init_word2idx()
        
        
    def est_dict(self,article_list):
        temp = []
        for index, _ in enumerate(article_list):
            filepath = os.path.join(self.folderpath,self.articles[index][0])
            print('analyze article {}'.format(index))
            with open(filepath) as f:
                article = f.read()
            article = self.CleanLines(article)
            article = self.SenToken(article)
            article = self.tokenize_to_word(article)
            article = self.spell_check_words(article)
            article = self.steamize_words(article)
            temp = temp + article[0]
        vocab = set(temp)
        word_to_ix = {word: i for i, word in enumerate(vocab)}
        pickle.dump(word_to_ix, open(f'word2_idx.pkl', 'wb'))    
    
    def save_wdx(self):
        self.est_dict(self.articles)
    
    def init_word2idx(self):
        self.word2idx = pickle.load(open(f'word2_idx.pkl', 'rb'))
        print('successfully loaded word dictionary with {} entries'.format(len(self.word2idx)))

    def __getitem__(self, index):
        filepath = os.path.join(self.folderpath,self.articles[index][0])
        with open(filepath) as f:
            article = f.read()
        article = self.CleanLines(article)
        article = self.SenToken(article)
        article = self.tokenize_to_word(article)
        article = self.spell_check_words(article)
        article = self.steamize_words(article)
        article = self.vectorize(article)
        sample = Variable(torch.from_numpy(article[0]))
        target = self.articles[index][1]
        return sample, target

    def Creat_article_list(self, csvfile, label='Basic '):
        df = pd.read_csv(csvfile)
        article_list = []
        for idx, cont in enumerate(df['Basics ']):
            if cont != 'NA':
                article_list.append([df['Name'][idx],cont])
        return article_list
    
    def SenToken(self, raw):  # split raw text into sentences
        sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_tokenizer.tokenize(raw)
        return  sents
    
    def CleanLines(self,line):
        #cleanline = re.sub('[:*~@^&()_+|\/><,.!\']', '', line)
        #delset = str.maketrans('', '', string.punctuation)
        cleanline = line.translate(self.delset)
        #cleanline = re.sub('0123456789', '', cleanline)
        #remove_digits = str.maketrans('', '', digits)
        cleanline = cleanline.translate(self.remove_digits)
        return cleanline
    
    def tokenize_to_word(self, article):
        words_tokenized =[]
        for sentence in article:
            sentence = self.CleanLines(sentence)
            sentence = nltk.word_tokenize(sentence)
            words_tokenized.append(sentence)
        return words_tokenized

    def steamize_words(self,article):

        for idx, sentence in enumerate(article):
            for ind, word in enumerate(sentence):
                word = self.snowball_stemmer.stem(word)
                sentence[ind] = self.wordnet_lematizer.lemmatize(word)
            sentence = [word for word in sentence if word not in stopwords.words('english')]
            article[idx] = sentence
        return article
    
    def spell_check_words(self,article):
        for idx, sentence in enumerate(article):
            for ind, word in enumerate(sentence):
                if not self.spelldict.check(word):
                    if self.spelldict.suggest(word) != []:
                        sentence[ind] = self.spelldict.suggest(word)[0]
                    else:
                        sentence[ind] = ' '
            article[idx] = sentence
        return article
    
    def vectorize(self,article):
        temp = []
        for idx, sentence in enumerate(article):
            for ind, word in enumerate(sentence):
                sentence[ind] = self.word2idx[word]
            temp.append(sentence)
        return np.array(temp)

    
    def tag_mask(self,article):
        mask = []
        for idx, sentence in enumerate(article):
            mask.append(nltk.pos_tag(sentence))
        return mask

    """
    def glove_init(self):
        words = []
        idx = 0
        word2idx = {}
        vectors = bcolz.carray(np.zeros(1), rootdir=f'{self.glove_path}/6B.50.dat', mode='w')

        with open(f'{self.glove_path}/glove.6B.50d.txt', 'rb') as f:
            #idx =0
            for l in f:
                line = l.decode().split()
                word = line[0]
                words.append(word)
                word2idx[word] = idx
                idx += 1
                vect = np.array(line[1:]).astype(np.float)
                vectors.append(vect)
                #print(idx+1)
    
        vectors = bcolz.carray(vectors[1:].reshape((400001, 50)), rootdir=f'{self.glove_path}/6B.50.dat', mode='w')
        vectors.flush()
        pickle.dump(words, open(f'{self.glove_path}/6B.50_words.pkl', 'wb'))
        pickle.dump(word2idx, open(f'{self.glove_path}/6B.50_idx.pkl', 'wb'))
    
    def init_word2vec(self):
        vectors = bcolz.open(f'{self.glove_path}/6B.50.dat')[:]
        words = pickle.load(open(f'{self.glove_path}/6B.50_words.pkl', 'rb'))
        word2idx = pickle.load(open(f'{self.glove_path}/6B.50_idx.pkl', 'rb'))
        glove = {w: vectors[word2idx[w]] for w in words}
        return glove 
    
    def word2map(self, article):
        matrix_len = 0
        for idx, sentence in enumerate(article):
            if matrix_len<len(sentence):
                matrix_len = len(sentence)
        vecarticle = []
        for idx, sentence in enumerate(article):        
            #words_found = 0
            weights_matrix = np.zeros((matrix_len, 50))
            for i, word in enumerate(sentence):
                try: 
                    weights_matrix[i] = self.w2v[word]
                    words_found += 1
                except:
                    print('key not founded, initialized random weights')
                    weights_matrix[i] = np.random.normal(scale=0.6, size=(50, ))  
            vecarticle.append(weights_matrix)
        
        return vecarticle
    """
    def __len__(self):
        return len(self.articles)
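

# A hypothetical usage sketch (the paths and file names are assumptions, not from the source).
# Note that __init__ calls init_word2idx(), so 'word2_idx.pkl' must already exist before the
# constructor can run (it is produced by save_wdx()).
if __name__ == '__main__':
    train_set = Myarticles('articles.csv', 'txt_articles/')
    val_set = Myarticles('articles.csv', 'txt_articles/', validation=True)
    print('train/val sizes:', len(train_set), len(val_set))
    sample, target = train_set[0]
    print('first target label:', target)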
Beispiel #59
0
# Creating nice bags of words
	# Import WordNetLemmatizer
	from nltk.stem import WordNetLemmatizer

	# Retain alphabetic words: alpha_only
	alpha_only = [t for t in lower_tokens if t.isalpha()]

	# Remove all stop words: no_stops
	no_stops = [t for t in alpha_only if t not in english_stops]

	# Instantiate the WordNetLemmatizer
	wordnet_lemmatizer = WordNetLemmatizer()

	# Lemmatize all tokens into a new list: lemmatized
	lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

	# Import Counter
	from collections import Counter

	# Create the bag-of-words: bow
	bow = Counter(lemmatized)

	# Print the 10 most common tokens
	print(bow.most_common(10))

# Using gensim

	# Import Dictionary
	from gensim.corpora.dictionary import Dictionary

	# Create a Dictionary from the articles: dictionary
	dictionary = Dictionary(articles)
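
	# A short assumed continuation: convert each tokenised article into the
	# bag-of-words representation gensim expects.
	corpus = [dictionary.doc2bow(article) for article in articles]
	print(corpus[0][:10])   # (token_id, count) pairs for the first article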
temp1 = []
temp2 = []
simi = []
final = []
same_sent1 = []
same_sent2 = []
# containers for the filtered and lemmatised tokens of the two input strings
filtered_sentence1 = []
filtered_sentence2 = []
lemm_sentence1 = []
lemm_sentence2 = []

lemmatizer = WordNetLemmatizer()

for words1 in word_tokenize(str1):
    if words1 not in stop_words:
        if words1.isalnum():
            filtered_sentence1.append(words1)

for i in filtered_sentence1:
    lemm_sentence1.append(lemmatizer.lemmatize(i))

for words2 in word_tokenize(str2):
    if words2 not in stop_words:
        if words2.isalnum():
            filtered_sentence2.append(words2)

for i in filtered_sentence2:
    lemm_sentence2.append(lemmatizer.lemmatize(i))

for word1 in lemm_sentence1:
    simi = []
    for word2 in lemm_sentence2:
        sims = []
        syns1 = wordnet.synsets(word1)
        syns2 = wordnet.synsets(word2)