Example No. 1
def text_tokenize(sentence):
    #stemmer = SnowballStemmer('english')
    lmtr = WordNetLemmatizer()
    tokens = [x.lower() for x in word_tokenize(sentence) if x.isalpha()]
    tokens_tagged = nltk.pos_tag(tokens)
    tokens_tagged = [(x, get_wordnet_pos(y)) for (x, y) in tokens_tagged if x not in stopwords.words('english')]
    return [lmtr.lemmatize(x, y) if y != '' else x for (x, y) in tokens_tagged]
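Note: text_tokenize assumes a get_wordnet_pos helper that maps Penn Treebank tags to WordNet POS constants; it is not shown with this example. A minimal sketch, consistent with the version defined inside _lemma_ further down in this collection, would be:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag prefix to the corresponding WordNet POS constant.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''  # text_tokenize falls back to the raw token in this case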
Example No. 2
 def lemmatize_tweet(self, tweet):
     wordnet_lemmatizer = WordNetLemmatizer()
     new_tweet = ''
     for word in tweet:
         stemmed_word = wordnet_lemmatizer.lemmatize(word)
         new_tweet = new_tweet + stemmed_word
     return new_tweet
def generate_captions_and_comments():
	
	with open('./data/big_data_approx.json') as json_file:   
		video_data = json.load(json_file)
		
	video_num_comments, video_captions = np.array([ (video_datum["score"], video_datum["captions"]) 
                                              for _,video_datum in video_data.items() ]).T
												
	# Define a stemmer and lemmatizer for use with our captions
	stemmer = PorterStemmer()
	lemmatizer = WordNetLemmatizer()


	combined_video_captions = []
	video_num_comments_cut  = []
	for caption_data_list,num_comments in zip(video_captions,video_num_comments):
		text = ""
		if caption_data_list is not None:
			video_num_comments_cut.append(num_comments)
			for caption_data in caption_data_list:
				if caption_data is not None and "text" in caption_data:
					for word in caption_data["text"].split():
						#text += (stemmer.stem(word)+" ")
						text += (lemmatizer.lemmatize(word)+" ")
			combined_video_captions.append(text[:-1])
		
	video_captions = combined_video_captions
	
	return (video_num_comments_cut, video_captions)
Example No. 4
def search_posts(phrase, engine):
    lemmatizer = WordNetLemmatizer()
    words = ["(^|[^a-z])" + lemmatizer.lemmatize(word)
                for word in word_tokenize(phrase)
                    if word not in stopwords.words('english')
                    and len(word) >= 3]

    if len(words) == 0:
        return None

    params = {'phrase': "|".join(words)}
    query = ["SELECT link_id, url, title FROM threads", 
             "WHERE title_lower ~ %(phrase)s"]
    found = pd.read_sql(" ".join(query), 
                       engine, 
                       params=params)
    
    if len(found['link_id']) == 0: 
        return None 

    link_ids = ', '.join(found['link_id'].apply(lambda lid: "'" + lid + "'"))
    query = ["SELECT clean_body as body, affil, link_id FROM cleaned", 
             "WHERE link_id IN (" + link_ids + ")"]
    data = pd.read_sql(" ".join(query), engine)
    
    valid = data[data['body'].apply(lambda text: len(text.split()) >= 10 
                                 and not bool(re.search("[^a-z]bot[^a-z]", text)))]
    
    if valid.shape[0] < 60: 
        return None
    
    return valid, found.set_index('link_id')
Example No. 5
def bow_score(hypothesis_list,text_list):
	wordnet_lemmatizer = WordNetLemmatizer()
	stop_word_list = ['a', 'an', 'the', ',', '.', ';', ':' ]
	i = 0
	while i < len(hypothesis_list):
		if hypothesis_list[i] in stop_word_list:
			del hypothesis_list[i]
			i = i - 1
		i = i  + 1
	if len(hypothesis_list) == 0:
		return 0
	i = 0	
	while i < len(text_list):
		if text_list[i] in stop_word_list:
			del text_list[i]
			i = i - 1
		i = i + 1
	if len(text_list) == 0:
		return 0
	## Stop words removed up until here

	score = 0	
	for word_text in text_list:
		lemma_text = wordnet_lemmatizer.lemmatize(word_text)
		for word_hypothesis in hypothesis_list:
			lemma_hypothesis = wordnet_lemmatizer.lemmatize(word_hypothesis)
			print(lemma_hypothesis)
			print(lemma_text)
			score += lexical_compare(lemma_text, lemma_hypothesis)
			print(score)
	return score
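bow_score relies on a lexical_compare helper that is not included in this example. Purely as an illustrative stand-in (not the original implementation), an exact-match comparison keeps the function runnable:
def lexical_compare(lemma_text, lemma_hypothesis):
	# Hypothetical stand-in: score 1 when the two lemmas are identical, else 0.
	return 1 if lemma_text == lemma_hypothesis else 0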
Example No. 6
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer",
        "LancasterStemmer", "WordNetLemmatizer"]
    if type is False or type not in supported_stemmers:
        return words_l
    else:
        l = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        if type == "WordNetLemmatizer":  # TODO: context
            wnl = WordNetLemmatizer()
            for word in words_l:
                l.append(wnl.lemmatize(word).encode(encoding))
        return l
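A short usage sketch for the stemming helper above; note that every branch returns byte strings because of the .encode(encoding) calls:
tokens = ["dogs", "churches", "running"]
print(stemming(tokens, type="WordNetLemmatizer"))  # lemmas, encoded as bytes
print(stemming(tokens, type="SnowballStemmer"))    # stems, encoded as bytes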
Example No. 7
def getFeature():
    with open(os.path.join('spam_filter_train.txt'), 'r') as f:
        trainData = f.readlines()
    with open(os.path.join('spam_filter_test.txt'), 'r') as f:
        testData = f.readlines()
    data = trainData + testData
    trainNum, testNum = len(trainData), len(testData)
    del trainData
    del testData

    for i in range(len(data)):
        data[i] = data[i].replace('\n', '').split('\t')[1]
    # lemmatize
    lemmatized = []
    wnl = WordNetLemmatizer()
    for line in data:
        lemmatized.append([wnl.lemmatize(word) for word in line.split(' ')])
    # remove stopwords
    stopwordRemoved = []
    sw = set(stopwords.words('english'))
    for line in lemmatized:
        stopwordRemoved.append(' '.join([x for x in line if x not in sw]))
    # tf feature
    vec = CountVectorizer()
    features = vec.fit_transform(stopwordRemoved)

    with open('trainFeatures.pkl', 'wb') as f:
        cPickle.dump(features[:trainNum], f)
    with open('testFeatures.pkl', 'wb') as f:
        cPickle.dump(features[trainNum:], f)
def labelBasedEntry(term,uri):
    wnl = WordNetLemmatizer()
    hm = {}
    sparql = Sparql.Connection()
    if " " in term:
        term = term.split(" ")[1]
        
    stem = wnl.lemmatize(term)
    wiktionary_informations = sparql.getWiktionaryInformationsNEW(stem)
    for x in wiktionary_informations:
        if " + " in x[0] and "," not in x[0] and "*" not in x[0]:
            tmp = x[0].split(" + ")[0]
            if "Adjective" in x[1]:
                hm[LexiconGenerator.AdjectivePPFrame(tmp, uri,{})] = ""
            if "Verb" in x[1]:
                hm[LexiconGenerator.TransitiveFrame(tmp, uri,{})] = ""
            if "Noun" in x[1]:
                hm[LexiconGenerator.NounPPFrame(tmp,uri,{})] = ""
        elif "," not in x[0] and "*" not in x[0]:
            if "Adjective" in x[1]:
                hm[LexiconGenerator.AdjectivePPFrame(term, uri,{})] = ""
            if "Verb" in x[1]:
                hm[LexiconGenerator.TransitiveFrame(term, uri,{})] = ""
            if "Noun" in x[1]:
                hm[LexiconGenerator.NounPPFrame(term,uri,{})] = ""

    if len(wiktionary_informations) == 0:
        hm[LexiconGenerator.TransitiveFrame(stem, uri,{})]  = ""
        hm[LexiconGenerator.NounPPFrame(stem,uri,{})] = ""
        
    entry = []
    for key in hm:
        entry.append(key)
            
    return entry
Example No. 9
    def getBoW(self, instance):
        bowFeatures = {}

        # tokens in the third position
        tokens = instance[3]
        # pos tag
        wordnet_lemmatizer = WordNetLemmatizer()
        tagged = nltk.pos_tag(tokens)
        i = 0
        for tag in tagged:
            if instance[2] == i:
                i +=1
                continue
                #sys.stderr.write('remove target word (%s)\n' % tag[0])
            elif tag[0] in stopwords.words("english"):
                i +=1
                continue
                #sys.stderr.write('stopword (%s)\n' % tag[0])
            elif re.match("N.*", tag[1]):
                bowFeatures['bow(%s)' %  wordnet_lemmatizer.lemmatize(tag[0], pos="n")] = True
            elif re.match("V.*", tag[1]):
                bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="v")] = True
            elif re.match("R.*", tag[1]):
                bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="r")] = True
            elif re.match("J.*", tag[1]):
                bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos="a")] = True
            i += 1
        return bowFeatures
Example No. 10
def preprocess(line, is_lmz=False):
    line = wordpunct_tokenize(line.strip())
    if is_lmz:
        lemmatizer = WordNetLemmatizer()
        line = [lemmatizer.lemmatize(word) for word in line]

    return line
Example No. 11
    def negator(self,wordVec):
        negation = False
        negated_doc = []
        lemmatizer = WordNetLemmatizer()
        for w,p in wordVec:
            w_out = ""
            if (p[:2] == "NN"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.NOUN)
            elif (p[:2] == "JJ"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.ADJ)
            elif (p[:2] == "VB"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.VERB)
            elif (p[:2] == "RB"):
                w_out = lemmatizer.lemmatize(w.lower(), pos=wordnet.ADV)
            if(w_out == "not" or w_out == "n't" ):
                #print "blah"
                negation = not negation
                #rint negation
            elif(w_out in string.punctuation and w_out != ''):

                negation = False
            elif(negation):
                #print negation
                w_out = "NOT_"+w_out
            negated_doc.append((w_out,p))
        #print negated_doc
        return negated_doc
def getWordCounts(WordCloudTweetNo):
    print('Fetching the most commonly used {0} words in the "{1}" feed...'.format(WordCloudTweetNo, ScreenName))
    cur = "DELETE FROM WordsCount;"
    conn.execute(cur)
    conn.commit()
    cur = 'SELECT tweet_text FROM UserTimeline'
    data = conn.execute(cur)
    StopList = stopwords.words('english')
    Lem = WordNetLemmatizer()
    AllWords = ''
    for w in tqdm(data.fetchall(),leave=1):
            try:
                #remove certain characters and strings
                CleanWordList = re.sub(r'http://[\w.]+/+[\w.]+', "", w[0], flags=re.IGNORECASE)
                CleanWordList = re.sub(r'https://[\w.]+/+[\w.]+', "", CleanWordList, flags=re.IGNORECASE)
                CleanWordList = re.sub(r'[@#\[\]\'"$.;{}~`<>:%&^*()-?_!,+=]', "", CleanWordList)
                #tokenize and convert to lower case
                CleanWordList = [words.lower() for words in word_tokenize(CleanWordList) if words not in StopList]
                #lemmatize words
                CleanWordList = [Lem.lemmatize(word) for word in CleanWordList]
                #join words
                CleanWordList =' '.join(CleanWordList)
                AllWords += CleanWordList
            except Exception as e:
                print (e)
                sys.exit(e)
    if AllWords is not None:
        words = [word for word in AllWords.split()]
        c = Counter(words)
        for word, count in c.most_common(WordCloudTweetNo):
            conn.execute("INSERT INTO WordsCount (word, frequency) VALUES (?,?)", (word, count))
            conn.commit()
Example No. 13
def preprocess_text(raw_text):
    """
        文本预处理操作
        参数:
            - raw_text  原始文本
        返回:
            - proc_text 处理后的文本
    """
    # 全部转换为小写
    raw_text = raw_text.lower()

    # 1. strip punctuation with a regular expression
    filter_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    words_only = filter_pattern.sub('', raw_text)

    # 2. tokenize
    raw_words = nltk.word_tokenize(words_only)

    # 3. lemmatize (normalize word forms)
    wordnet_lematizer = WordNetLemmatizer()
    words = [wordnet_lematizer.lemmatize(raw_word) for raw_word in raw_words]

    # 4. remove stop words
    filtered_words = [word for word in words if word not in stopwords.words('english')]

    proc_text = ' '.join(filtered_words)

    return proc_text
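A one-line usage sketch of the pipeline above, assuming the imports the snippet relies on (re, string, nltk, stopwords) are available:
# prints the lowercased, lemmatized text with punctuation and stop words removed
print(preprocess_text("The cats are running and the dogs were barking!"))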
Example No. 14
def split_into_words(text, lemmatize=False, reattach=True, replace_numbers=True, split_off_quotes=True,
                     fix_semicolon_mistakes=True):

    if fix_semicolon_mistakes:
        text = fix_semicolons(text)

    word_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    # get rid of certain character so that we can use those for special purposes
    tokens = word_tokenizer.tokenize(text)
    if reattach:
        tokens = reattach_clitics(tokens)

    if split_off_quotes:
        tokens = split_off_quote_marks(tokens)

    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    if replace_numbers:
        tokens = [re.sub('[0-9]', '#', t) for t in tokens]

    tokens = split_off_final_punctuation(tokens)

    tokens = split_off_commas(tokens)

    return tokens
Example No. 15
def lemmatize_tweets(input_path, output_path):
    wordnet_lemmatizer = WordNetLemmatizer()

    input_files = glob.glob("%s/dataset_*.out" % input_path)

    for input_file in input_files:
        results = re.search('(dataset_.+)\.out', input_file)
        filename = results.groups()[0]

        output_file = "%s/%s_converted.out" % (output_path, filename)
        output_file2 = "%s/%s_converted.out.id" % (output_path, filename)

        with codecs.open(output_file, encoding='utf-8', mode='w') as out:
            with codecs.open(output_file2, encoding='utf-8', mode='w') as out2:
                print >>out, "<doc>"
                with codecs.open(input_file, encoding='utf-8', mode='r') as f:
                    for line in f:
                        if re.search('TWEETID(\d+)START', line):
                            results = re.match('TWEETID(\d+)START', line)
                            groups = results.groups()
                            print >>out, "<p>"
                            print >>out2, "ID=%d" %(int(groups[0]))

                        elif re.search('TWEETID(\d+)END', line):
                            print("</p>", file=out)
                        elif re.search("(.+)\t(.+)\t(.+)\n", line):
                            results = re.match("(.+)\t(.+)\t(.+)\n", line)
                            groups = results.groups()
                            word = groups[0]
                            pos = groups[1]
                            lemma = wordnet_lemmatizer.lemmatize(word)
                            print >>out, "%s\t%s\t%s" %(word, pos, lemma)
                            print >>out2, "%s\t%s\t%s" %(word, pos, lemma)

                print >>out, "</doc>"
def create_lexicon(pos_file, neg_file):
    lex = []

    # read the file
    def process_file(_f):
        with open(_f, 'r') as f:
            lex = []
            lines = f.readlines()
            # print(lines)
            for line in lines:
                words = word_tokenize(line.lower())
                lex += words
            return lex

    lex += process_file(pos_file)
    lex += process_file(neg_file)
    # print(len(lex))
    lemmatizer = WordNetLemmatizer()
    lex = [lemmatizer.lemmatize(word) for word in lex]  # lemmatization (cats -> cat)

    word_count = Counter(lex)
    # print(word_count)
    # {'.': 13944, ',': 10536, 'the': 10120, 'a': 9444, 'and': 7108, 'of': 6624, 'it': 4748, 'to': 3940......}
    # drop very common words (the, a, and, ...) and very rare ones; they contribute nothing to deciding whether a review is positive or negative
    lex = []
    for word in word_count:
        if word_count[word] < 2000 and word_count[word] > 20:  # hard-coded thresholds; a percentage cut-off would probably work better
            lex.append(word)  # Zipf's law - verifying the Zipf distribution of text with Python: http://blog.topspeedsnail.com/archives/9546
    return lex
Example No. 17
def main(in_file="zargan/data/stats20080113-02.txt", out_file="zargan/data/filtered.txt"):
    f = codecs.open(in_file, encoding="iso-8859-1")
    o = open(out_file, "w")
    eliminated = open("zargan/data/eliminated.txt", "w")
    o.write(f.readline())
    wnl = WordNetLemmatizer()

    for line in f:
        cols = line.split("|")

        if len(cols) != 12:
            continue
        if filter_ip(cols):
            continue
        if filter_corporation(cols):
            continue
        if filter_campaign(cols):
            continue

        word = cols[0]
        ip = cols[1]

        if filter_dictionary(cols):
            eliminated.write("%s\n" % char_fix(word).lower())
            continue
        if cols[8] == "2":  # split() yields strings, so compare against "2"
            cols[0] = wnl.lemmatize(cols[0])
        o.write("%s\n" % char_fix("|".join(cols).encode("utf8")))
    o.close()
    eliminated.close()
Example No. 18
def GetCleanWords(content_string):
    
    # Tokenize the sentences using the Punkt word tokenizer
    tokenized_words = PunktWordTokenizer().tokenize(content_string)
    
    #Now let's remove the stop words
    tokenized_words = [word for word in tokenized_words if word.lower() not in stopwords_list]
    
    # Now let's remove all of the solely punctuation.
    punctuation_list = ['.',',',';',':','!','?']
    tokenized_words = [word for word in tokenized_words if word not in punctuation_list]
    
    # Finally let's get rid of the punctuation at the end of each word
    cleaned_words = []
    for word in tokenized_words:
        if word[-1] in punctuation_list:
            cleaned_words.append(word[:-1])
        else:
            cleaned_words.append(word)
    
    # Now let's stem each of the words to lower our word count
    wnl = WordNetLemmatizer()
    clean_and_stemmed_words = [wnl.lemmatize(cleaned_word) for cleaned_word in cleaned_words] 
    
    return clean_and_stemmed_words
Example No. 19
 def __tokenize(self, text):
     if text.lower()!='not available\n':
         lemms=[]
         wnl = WordNetLemmatizer()
         #st = PorterStemmer()
         for item in self.__tok(text):
             if item.isalpha():
                 lemms.append(wnl.lemmatize(item.lower()))
                 #lemms.append(st.stem(item.lower()))
             else:
                 if item.isdigit():
                     if int(item)>=1700 and int(item)<=2100:
                         lemms.append('YEAR')
                     else:
                         lemms.append('DIGIT')
                 else:
                     #pass
                     lemms.append(item)
                     if item[-2:]=='th' and item[:-2].isdigit() or item[-2:]=='st' and item[:-2].isdigit() or item[-2:]=='nd'and item[:-2].isdigit() or item[-2:]=='rd'and item[:-2].isdigit():
                         lemms.append('ORDERNUM')
                     elif item[-2:]=='pm' and item[:-2].isdigit() or item[-2:]=='am' and item[:-2].isdigit():
                         lemms.append('HOUR')
                     elif item=='4EXCL' or item=='5QUEST' or item=='6POINT':
                         lemms.append(item)
                     else:
                         lemms.append('NAME_NAME')
                         #print(item)
     else:
         lemms=[]
     #print(lemms)
     return lemms
Example No. 20
def tokenizer(message):
	''' Tokenize/Lemmatize the words'''
	if isinstance(message, bytes):
		message = message.decode('utf-8')
	message = message.lower()
	message = remove_punctuation_unicode(message)
	words = [ word for word in word_tokenize(message) if word not in stop_words ]
	WNL = WordNetLemmatizer()
	return [ WNL.lemmatize(word) for word in words ]
Example No. 21
def get_clean_text(list_filenames, path_to_file):
    '''
    parameter:
    ----------
    list_filenames: as LST is a list of filename as STR
    path_to_file: as STR is the path to the file containing movie scripts
    --> such that path_to_file/filename.txt is the file to open

    returns:
    --------
    list of list of words (lemmatize, lowercase) in the text (order preserved)
    '''
    wnl = WordNetLemmatizer()
    list_texts_as_words = []
    for filename in list_filenames:
        path_file = path_to_file+"/"+filename+".txt"
        with open(path_file) as f:
            text = f.readlines()
            lines = [line.strip() for line in text if line.strip()]
            string_words = []
            for line in lines:
                words = [wnl.lemmatize(word.lower()) for word in line.split(' ') if wnl.lemmatize(word.lower())]
                string_words += words
        list_texts_as_words.append(string_words)
    return list_texts_as_words
	def mapper(self, _, line):
		lnum,gsid,reviewer,postdate,stars,review = line.split(',',5)

		# skip if this is the first/header line
		if (gsid == "\"gsid\""):
			return

		# skip if this school is not rated
		if((stars == "0") or (stars == "99")):
			# print("Skipping ", stars)
			return

		# lower case all review text
		review = review.lower()

		# remove html encodings
		review = review.replace('&amp;','').replace('&lt;','').replace('&gt;','').replace('&quot;','').replace('&#039;','').replace('&#034;','')

		# remove punctuation
		review = review.translate(str.maketrans("", "", string.punctuation))

		# remove stop words
		stopset = set(nltk.corpus.stopwords.words('english'))
		review = [word for word in review.split() if not word in stopset]

		# Use nltk's lemmatizer to create word stems
		wnl = WordNetLemmatizer()
		review = [wnl.lemmatize(word) for word in review]

		# Limit use of word to once per review
		words = set(review)

		for word in words:
			yield word, stars
def possibility():
    wnl = WordNetLemmatizer()
    verb = wnl.lemmatize(verbs[random.randrange(0, len(verbs))])
    noun = wnl.lemmatize(nouns[random.randrange(0, len(nouns))])

    article = "a"
    if noun[0] in ["a", "e", "i", "o", "u"]:
        article = "an"

    if random.randrange(0, 100) < chance_quantity:
        quantity_word = quantity_adverbs[random.randrange(0, len(quantity_adverbs))]
        if not noun.endswith("s") and not noun.endswith("y") and not quantity_word == "numerous":
            noun += "s"
        possibility = verb + " " + quantity_word + " of the " + noun

    elif random.randrange(0, 100) < chance_location:
        location_word = location_adverbs[random.randrange(0, len(location_adverbs))]
        possibility = (
            verb
            + " "
            + article
            + " "
            + noun
            + " "
            + location_word
            + " the "
            + wnl.lemmatize(nouns[random.randrange(0, len(nouns))])
        )

    else:
        possibility = verb + " " + article + " " + noun

    return possibility
Example No. 24
def getdata():
    """
    retrieves the data from repository table
    removes the special characters and stop words and does the stemming(using nltk package)
    """
    conn=db.getDBConnection()
    cursor = conn.cursor()
    global stopWordSet
    sql = "select id, description from repository"
    rows = db.executeSQL(conn, sql)
    counter=1
    wnl = WordNetLemmatizer()
    for row in rows:
        id = row[0]
        desc= row[1]
        #print desc
        if desc is not None:
            desc=desc.replace('-',' ').replace(',',' ').replace('/',' ').replace('.',' ').replace('_',' ')
            desc = desc.lower()
            desc = re.sub('[^a-z0-9 ]','',desc)
            keywords = desc.split(" ")
            for word in keywords:
                #word = porter.stem(word.strip())
                word=wnl.lemmatize(word.strip())
                if word not in stopWordSet:
                    sql1 = "insert into keywords1 values("+str(counter)+",'"+word+"',"+str(id)+ ',' + str(0) + ")"
                    print(sql1)
                    cursor.execute(sql1)
                    conn.commit()
                    counter = counter+1
Example No. 25
    def lemmatize(self):
        wnl = WordNetLemmatizer()
        self.lemma_list = []

        for i in self.tokens_no_punct:
            lemmy_word = wnl.lemmatize(i)
            self.lemma_list.append(str(lemmy_word))
def preProcessHistogram(documents):

    """
    preProcessHistogram(listofString) -> listOfString

    consumes a listofSentences and Tokenizes it, and returns a list of lemmatized words.

    """

    paragraph = ""
    for sentence in documents:
        paragraph = paragraph + " " + sentence.lower()
   
    #remove all punctuation (the text was lowercased above)
    lowerCaseParagraph = paragraph.translate(str.maketrans("", "", punctuation))
    
    words = lowerCaseParagraph.split()  
    
    
    lemmatizer = WordNetLemmatizer()
    #lemmatize every word (if it needs to be lemmatized) and remove words that are too long, because chances are they aren't words.
    words = [lemmatizer.lemmatize(x, 'v') for x in words]
    words = [x for x in words if len(x) < 10 or x.isdigit()]
    
    return words
def _lemma_(token):

	if isinstance(token, str):
		return _stem_(token)
	from nltk.corpus import wordnet

	def get_wordnet_pos(treebank_tag):

		if treebank_tag.startswith('J'):
			return wordnet.ADJ
		elif treebank_tag.startswith('V'):
			return wordnet.VERB
		elif treebank_tag.startswith('N'):
			return wordnet.NOUN
		elif treebank_tag.startswith('R'):
			return wordnet.ADV
		else:
			return ''

	from nltk.stem import WordNetLemmatizer
	wordnet_lemmatizer = WordNetLemmatizer()
	p = get_wordnet_pos(token.pos()[0][1])
	if p!=wordnet.VERB:
		return _stem_(token[0])
	rs = wordnet_lemmatizer.lemmatize(token[0], pos=p)
	return rs
Example No. 28
def lemmatize(tweets):
	'''
	Lemmatize words in the corpus.
	
	Input:
	------------------
	tweets: List of lists, [[word1OfTweet1, word2OfTweet1,...,word_m1OfTweet1],
						   	[word1OfTweet2, word2OfTweet2,...,word_m2OfTweet2],
						   						. 								
						   						. 
						   						.
						    [word1OfTweetN, word2OfTweetN,...,word_mNOfTweetN]]
	Output:
	-----------------
	newTweets: All the words in the tweet lemmatized.
	'''
	wordnet_lemmatizer = WordNetLemmatizer()
	pos_tag_tweets = [nltk.pos_tag(t) for t in tweets]
	tweets = []
	i = 0
	for t in pos_tag_tweets:
		tt = []
		for w in t:
			if get_wordnet_pos(w[1]) =='':
				tt.append(w[0])
			else:
				try:
					tt.append(wordnet_lemmatizer.lemmatize(w[0], pos = get_wordnet_pos(w[1])))
				except UnicodeDecodeError:
					pass
		tweets.append(tt)
		i += 1
	return tweets
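A usage sketch for lemmatize, where each tweet is already a list of tokens (it assumes nltk is imported and the same get_wordnet_pos helper sketched after Example No. 1 is defined):
tweets = [["the", "cats", "are", "running"], ["dogs", "barked", "loudly"]]
print(lemmatize(tweets))  # each token lemmatized with its POS where one is available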
def preprocess_wikidata(raw):
 # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove tokens shorter than three characters
    tokens = [i for i in tokens if len(i)>2]

    return (tokens, text)
def get_words_list(dataset):
	'''
	Load the dataset, read the contents, tokenize the text and lemmatize the words.
	'''

	# join the path and file name together
	spam_path = 'data/enron/pre/' + dataset + '/spam/'
	ham_path = 'data/enron/pre/' + dataset + '/ham/'
	spam_npl = [i[-1] for i in os.walk(spam_path)][0]
	ham_npl = [i[-1] for i in os.walk(ham_path)][0]

	spam_fl = (open(os.path.join(spam_path, j)).read().lower() for j in spam_npl)
	ham_fl = (open(os.path.join(ham_path, j)).read().lower() for j in ham_npl)

	splitter = re.compile("\\W*")
	english_stops = set(stopwords.words('english'))
	lemmatizer = WordNetLemmatizer()

	# tokenize the files into words
	spam_wl = [None]*len(spam_npl)
	for i,f in enumerate(spam_fl):
		spam_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f) \
				if w not in english_stops and w.isalpha()) if len(word) > 2 and len(word) < 20]

	ham_wl = [None]*len(ham_npl)
	for i,f in enumerate(ham_fl):
		ham_wl[i] = [word for word in (lemmatizer.lemmatize(w) for w in splitter.split(f) \
				if w not in english_stops and w.isalpha()) if len(word) > 2 and len(word) < 20]

	return spam_wl, ham_wl
Example No. 31
def lemmatize(x):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(x, pos='v')
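The pos='v' argument is what makes this useful: by default WordNetLemmatizer treats every word as a noun. A small illustration:
print(lemmatize('running'))                      # 'run' (verb lemmatization, as defined above)
print(WordNetLemmatizer().lemmatize('running'))  # 'running' (default noun POS leaves it unchanged)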
Example No. 32
def lemmatize_stemming(text, stemmer):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
 def __init__(self):
     self.wnl = WordNetLemmatizer()
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc.lower())]
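LemmaTokenizer follows the pattern recommended in the scikit-learn documentation for plugging a lemmatizer into a vectorizer; a usage sketch, assuming scikit-learn is installed:
from sklearn.feature_extraction.text import CountVectorizer

# The instance is callable, so it can be passed directly as the tokenizer.
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer())
X = vectorizer.fit_transform(["The cats sat on the mats.", "A cat sits on a mat."])
print(X.shape)  # documents x lemmatized vocabulary terms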
Example No. 35
    file_path='./Dataset/'
    save_path='./SData/'
    file_class=['Training-800/Crude','Training-800/Grain','Training-800/Interest','Training-800/Trade','Testing-40']
    for i,x in enumerate(file_class):
        file_name = os.listdir(file_path+str(x))
        file_name.sort(key=lambda x:int(x[:-4]))
        for file in file_name:
            file_ = open(file_path+str(x)+'/'+file)
            text = file_.read()
            result = re.sub(r'\d+','',text)
            # print(result)
            tokens = word_tokenize(result)
            token_words = pos_tag(tokens)

            words_lematizer = []
            wordnet_lematizer = WordNetLemmatizer()
            for word, tag in token_words:
                if tag.startswith('NN'):
                    word_lematizer = wordnet_lematizer.lemmatize(word, pos='n')  # n = noun
                elif tag.startswith('VB'):
                    word_lematizer = wordnet_lematizer.lemmatize(word, pos='v')  # v = verb
                elif tag.startswith('JJ'):
                    word_lematizer = wordnet_lematizer.lemmatize(word, pos='a')  # a = adjective
                elif tag.startswith('R'):
                    word_lematizer = wordnet_lematizer.lemmatize(word, pos='r')  # r = adverb
                else:
                    word_lematizer =  wordnet_lematizer.lemmatize(word)
                words_lematizer.append(word_lematizer)
            cleaned_words = [word for word in words_lematizer if word not in stopwords.words('english')]
            characters = [',', '.','DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$',"'", '%','--','-','...','^','{','}',"''","``"]
            words_list = [word for word in cleaned_words if word not in characters]
Example No. 36
def extract_Relevant_Features(filename):
    with open(filename, 'r') as text_file:
        my_corpus = text_file.read().split('\n')

    # Initializing list for tokenized and tagged corpus
    token_and_tagged_corpus = []

    # For each record, calling the tokenizer and tagger
    for record in my_corpus:
        if record:
            text = record.split('|')[1]
            token_and_tagged_corpus += tokenize_and_tag_text(text)

    # Chunking the tagged, and tokenized corpus
    chunk_sents = chunk_tagged_sents(token_and_tagged_corpus)

    # From the chunked data, retrieve the list
    chunks_list = []
    chunk_header = []

    for i in range(chunktypes):
        chunktype = 'P' + str(i + 1)
        chunks_from_sents = get_just_chunks(chunk_sents, chunktype)
        freq_type_chunks = freq_chunks(chunks_from_sents, 100)
        chunk_header.append(chunktype)
        chunks_list.append(freq_type_chunks)

    table_chunking = ListTable()
    table_chunking.append(chunk_header)

    iteration = 0
    for chunklist in chunks_list:
        if len(chunklist) > iteration:
            iteration = len(chunklist)

    for i in range(iteration):
        row = []
        for chunklist in chunks_list:
            if i < len(chunklist):
                row.append(chunklist[i])
            else:
                row.append('')

        table_chunking.append(row)

    wnl = WordNetLemmatizer()
    word_dict = {}
    features = []
    for row in table_chunking:
        for col in row:
            features.append(col)

    for feature in features:
        words = feature.split(' ')
        for word in words:
            word_lemma = wnl.lemmatize(word.lower())
            if word_lemma in word_dict: word_dict[word_lemma].append(feature)
            else: word_dict[word_lemma] = [feature]

    key_pair_dict = {}

    for key1 in word_dict:
        if wn.synsets(key1, 'n') == []:
            continue
        for key2 in word_dict:
            if key1 == key2:
                continue
            if wn.synsets(key2, 'n') == []:
                continue
            key_pair = tuple(sorted([key1, key2]))
            if key_pair in key_pair_dict:
                continue
            else:
                feature_list1 = word_dict[key1]
                feature_list2 = word_dict[key2]
                similarity = avg_similarity(feature_list1, feature_list2)
                if similarity > 0.8:
                    key_pair_dict[key_pair] = similarity

    sorted_key_pair_dict = sorted(key_pair_dict.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)

    key_pair_list = []
    for keypair, sim in sorted_key_pair_dict:
        key_pair_list.append(keypair)

    topics = {}
    keypair_copy = key_pair_list[:]

    for key11, key12 in key_pair_list:
        for keypair2 in key_pair_list:
            if key11 == keypair2[0] and key12 == keypair2[1]: continue
            if key11 in keypair2:
                if key11 in topics: topics[key11] += [keypair2[0], keypair2[1]]
                else:
                    topics[key11] = [key11, key12, keypair2[0], keypair2[1]]
            if key12 in keypair2:
                if key12 in topics: topics[key12] += [keypair2[0], keypair2[1]]
                else:
                    topics[key12] = [key11, key12, keypair2[0], keypair2[1]]

    for item in topics:
        topics[item] = list(set(topics[item]))

    term_list = []
    for topic in topics:
        term_list.append(topic)
    producttype = ['cell', 'phone', 'telephone', 'mobile']

    # Dictionary with final 'features' and associated list of similar words for each feature
    final_term_dict = {}

    for term in term_list:
        sim = avg_similarity_words(term, producttype)
        if sim > 0.70:
            final_term_dict[term] = list(set(topics[term]))

    return final_term_dict
Example No. 37
def fn_nvarFilt(poses, nvar):
    nvjr = list(nvar)
    ja = lambda tag: tag if tag != 'j' else 'a'
    nvars = [(p0, ja(p1[:1].lower())) for p0, p1 in poses if p1[:1] in nvjr]
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(nvar[0], pos=nvar[1]) for nvar in nvars]
class WordNetSimilarity:
    """Extend the NLTK's WordNet with more similarity metrics, word lemmatization, and multilingual."""
    def __init__(self, ic_corpus='brown'):
        self._ic_corpus = wordnet_ic.ic(
            'ic-brown.dat') if ic_corpus == 'brown' else wordnet_ic.ic(
                'ic-semcor.dat')
        self._wn_max_depth = 19
        self._default_metrics = [
            'path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath'
        ]
        self._wn_lemma = WordNetLemmatizer()

    def method(self, name):
        def function(syn1, syn2):
            score = getattr(self, name)(syn1, syn2)
            return abs(score)

        return function

    def synset_expand(self, s):
        result = [s]
        hypos = s.hyponyms()
        if not hypos:
            return result
        for h in hypos:
            result.extend(self.synset_expand(h))
        return result

    #return all the noun synsets in wordnet
    def get_all_synsets(self):
        return wn.all_synsets('n')

    def get_all_lemma_names(self):
        return wn.all_lemma_names('n')

    def offset2synset(self, offset):
        '''
        offset2synset('06268567-n')
        Synset('live.v.02')
        '''
        return wn._synset_from_pos_and_offset(str(offset[-1:]),
                                              int(offset[:8]))

    def synset2offset(self, ss):
        return '%08d-%s' % (ss.offset(), ss.pos())

    #semcor live%2:43:06::
    def semcor2synset(self, sense):
        return wn.lemma_from_key(sense).synset()

    def semcor2offset(self, sense):
        '''
        semcor2synset('editorial%1:10:00::')
        06268567-n
        '''
        return self.synset2offset(self.semcor2synset(sense))

    def word2synset(self, word, pos=wn.NOUN):
        word = self._wn_lemma.lemmatize(word)
        return wn.synsets(word, pos)

    def languages(self, l=None):
        """Return a list of supported languages or find the corresponding language code of supported language.

        :param l: The default value is None or the name of language
        :return: if the default none is set, return a list of supported language names. When l is assigned with a
        language name, the corresponding code is returned.

        User should use this function to check the languages and find the language code.
        """
        langs = {
            'albanian': 'als',
            'arabic': 'arb',
            'bulgarian': 'bul',
            'chinese_simplified': 'cmn',
            'chinese_traditional': 'qcn',
            'danish': 'dan',
            'greek': 'ell',
            'english': 'eng',
            'persian': 'fas',
            'finnish': 'fin',
            'french': 'fra',
            'hebrew': 'heb',
            'croatian': 'hrv',
            'icelandic': 'isl',
            'italian': 'ita',
            'japanese': 'jpn',
            'catalan': 'cat',
            'basque': 'eus',
            'galicain': 'glg',
            'spanish': 'spa',
            'indonesian': 'ind',
            'malay': 'zsm',
            'dutch': 'nld',
            'polish': 'pol',
            'portuguese': 'por',
            'romanian': 'ron',
            'lithuanian': 'lit',
            'slovak': 'slk',
            'slovene': 'slv',
            'swedish': 'swe',
            'thai': 'tha'
        }
        if l:
            if l.lower() in langs:
                return langs[l.lower()]
            else:
                return l + " is not supported!"
        return [x.capitalize() for x in langs]

    def multilingual2synset(self, word, lang='spa'):
        """
        Map words in different language to wordnet synsets
        ['als', 'arb', 'cat', 'cmn', 'dan', 'eng', 'eus', 'fas', 'fin', 'fra', 'fre',
         'glg', 'heb', 'ind', 'ita', 'jpn', 'nno','nob', 'pol', 'por', 'spa', 'tha', 'zsm']
        :param word: a word in different language that has been defined in
         Open Multilingual WordNet, using ISO-639 language codes.
        :param lang: the language code defined
        :return: wordnet synsets.
        """
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        return wn.synsets(word, lang=lang, pos=wn.NOUN)

    @memoized
    def similarity(self, c1, c2, name='wpath'):
        """
        Compute semantic similarity between two concepts
        :param c1:
        :param c2:
        :param name:
        :return:
        """
        return self.method(name)(c1, c2)

    def max_synset_similarity(self, syns1, syns2, sim_metric):
        """
        Compute the maximum similarity score between two list of synsets
        :param syns1: synset list
        :param syns2: synset list
        :param sim_metric: similarity function
        :return: maximum semantic similarity score
        """
        return max([sim_metric(c1, c2) for c1 in syns1 for c2 in syns2] + [0])

    @memoized
    def word_similarity(self, w1, w2, name='wpath'):
        """ Return similarity score between two words based on WordNet.

        :param w1: first word to be compared which should be contained in WordNet
        :param w2: second word to be compared which should be contained in WordNet
        :param name: the name of knowledge-based semantic similarity metrics
        :return: numerical score indicating degree of similarity between two words. The
        minimum score is 0. If one of the input words is not contained in WordNet, 0 is given. The up bound of
        the similarity score depends on the similarity metric you use. Bigger similarity values indicate higher
        similarity between two words.
        :rtype : Float

        """
        s1 = self.word2synset(w1)
        s2 = self.word2synset(w2)
        sim_metric = lambda x, y: self.similarity(x, y, name)
        return self.max_synset_similarity(s1, s2, sim_metric)

    @memoized
    def best_synset_pair(self, w1, w2, name='wpath'):
        s1 = self.word2synset(w1)
        s2 = self.word2synset(w2)
        sims = Counter({(c1, c2): self.similarity(c1, c2, name)
                        for c1 in s1 for c2 in s2})
        return sims.most_common(1)[0][0]

    def word_similarity_all_metrics(self, w1, w2):
        return {
            m: self.word_similarity(w1, w2, name=m)
            for m in self._default_metrics
        }

    @memoized
    def word_similarity_wpath(self, w1, w2, k):
        s1 = self.word2synset(w1)
        s2 = self.word2synset(w2)
        sim_metric = lambda x, y: self.wpath(x, y, k)
        return self.max_synset_similarity(s1, s2, sim_metric)

    @memoized
    def monol_word_similarity(self, w1, w2, lang='spa', name='wpath'):
        """
         Compute mono-lingual word similarity, two words are in same language.
        :param w1: word
        :param w2: word
        :param lang: language code
        :param name: name of similarity metric
        :return: semantic similarity score
        """
        s1 = self.multilingual2synset(w1, lang)
        s2 = self.multilingual2synset(w2, lang)
        sim_metric = lambda x, y: self.similarity(x, y, name)
        return self.max_synset_similarity(s1, s2, sim_metric)

    @memoized
    def crossl_word_similarity(self,
                               w1,
                               w2,
                               lang1='spa',
                               lang2='eng',
                               name='wpath'):
        """
         Compute cross-lingual word similarity, two words are in different language.
        :param w1: word
        :param w2: word
        :param lang1: language code for word1
        :param lang2: language code for word2
        :param name: name of similarity metric
        :return: semantic similarity score
        """
        s1 = self.multilingual2synset(w1, lang1)
        s2 = self.multilingual2synset(w2, lang2)
        sim_metric = lambda x, y: self.similarity(x, y, name)
        return self.max_synset_similarity(s1, s2, sim_metric)

    def least_common_subsumer(self, c1, c2):
        return c1.lowest_common_hypernyms(c2)[0]

    def synset_ic(self, c):
        return information_content(c, self._ic_corpus)

    def dpath(self, c1, c2, alpha=1.0, beta=1.0):
        lcs = self.least_common_subsumer(c1, c2)
        path = c1.shortest_path_distance(c2)
        path = 1.0 / (1 + path)
        path = path**alpha
        depth = lcs.max_depth() + 1
        depth = depth * 1.0 / (1 + self._wn_max_depth)
        depth = depth**beta
        return math.log(1 + path * depth, 2)

    def wpath(self, c1, c2, k=0.8):
        lcs = self.least_common_subsumer(c1, c2)
        path = c1.shortest_path_distance(c2)
        weight = k**self.synset_ic(lcs)
        return 1.0 / (1 + path * weight)

    def li(self, c1, c2, alpha=0.2, beta=0.6):
        path = c1.shortest_path_distance(c2)
        lcs = self.least_common_subsumer(c1, c2)
        depth = lcs.max_depth()
        x = math.exp(-alpha * path)
        y = math.exp(beta * depth)
        z = math.exp(-beta * depth)
        a = y - z
        b = y + z
        return x * (a / b)

    def path(self, c1, c2):
        return c1.path_similarity(c2)

    def wup(self, c1, c2):
        return c1.wup_similarity(c2)

    def lch(self, c1, c2):
        return c1.lch_similarity(c2)

    def res(self, c1, c2):
        return c1.res_similarity(c2, self._ic_corpus)

    def jcn(self, c1, c2):
        lcs = self.least_common_subsumer(c1, c2)
        c1_ic = self.synset_ic(c1)
        c2_ic = self.synset_ic(c2)
        lcs_ic = self.synset_ic(lcs)
        diff = c1_ic + c2_ic - 2 * lcs_ic
        return 1.0 / (1 + diff)

    def lin(self, c1, c2):
        return c1.lin_similarity(c2, self._ic_corpus)
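A usage sketch for WordNetSimilarity, assuming the NLTK wordnet and wordnet_ic corpora are installed and the memoized decorator and other imports used above are available:
ws = WordNetSimilarity()
print(ws.word_similarity('car', 'automobile', 'wup'))  # shared synset, score near the top of the range
print(ws.word_similarity('car', 'banana', 'wup'))      # unrelated words, noticeably lower score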
    for category in distribution_dictionary:
        percentage_dictionary[category] = 0
        for word_pair2 in distribution_dictionary[category]:
            percentage_dictionary[category] += word_pair2[1]
            total_words += word_pair2[1]
    for category in percentage_dictionary:
        if total_words != 0:
            percentage_dictionary[category] /= total_words
    print('done...')
    store_dictionary('Instagram_tag_dictionary.json', dictionary)
    return similarity_dictionary, recognition_rate, distribution_dictionary, percentage_dictionary

# Setting up all the necessary preparation
wordlist = set(words.words())
wordnet_lemmatizer = WordNetLemmatizer()
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
my_dictionary = load_dictionary('Instagram_tag_dictionary.json')
wordlist = combine_dictionary(wordlist, my_dictionary)
spider = InstagramSpider()
# get the username from user
username = input('Please give me the user name to analyze: ')
# get the tag data of this user
data = get_data(spider, username)
print('data got...')
print('analyzing tags from user: '******'analyzing words from tags from user: ' + username)
result, rate, distribute_result, percentage_result = analyze_words(my_words=words_from_tags,
                                                                   dictionary=my_dictionary)
Example No. 40
#import urllib
from stanfordcorenlp import StanfordCoreNLP
import nltk
nltk.download("wordnet")
nltk.download("punkt")
from nltk.tag import pos_tag
import re, string
from nltk.stem import WordNetLemmatizer
l = WordNetLemmatizer()
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from keras.models import load_model
model = load_model('model.h5')
import json
import random
import time
import wikipedia
from flickrapi import FlickrAPI
import os, io, requests
import re
import wikipediaapi
import random

#Load the intents
path = os.path.dirname(os.path.abspath(__file__)) + '/stanford-corenlp-4.2.0'
nlp = StanfordCoreNLP(r'%s' % path)
intents = json.loads(open('intents.json').read())
sentiment = pickle.load(open("SentimentalAnalysis.pkl", "rb"))

#Load the words and classes files using pickle
Example No. 41
nlp_spacy = spacy.load('en') # Spacy

warnings.filterwarnings('ignore')

stop_words_nltk = stopwords.words('english')

#adding the extra words to nltk stopwords
stop_words_nltk.extend(extra_words)

stop_words_nltk.sort()

#adding the extra words to spacy stopwords as well, in case only one of the two is used
for word in extra_words:
    nlp_spacy.vocab[word].is_stop = True

nltk_lemmatizer = WordNetLemmatizer()

#used to search in nltk stop_words
def BinarySearch(a, x): 
    i = bisect_left(a, x) 
    if i != len(a) and a[i] == x: 
        return i 
    else: 
        return -1
    
#nltk_stemmer = SnowballStemmer()
#nltk_stemmer = LancasterStemmer()
#nltk_stemmer = PorterStemmer()


from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from pylab import *
import nltk
import warnings
warnings.filterwarnings('ignore')


# In[2]:


stop_words = stopwords.words('english')
stop_words = stop_words + list(string.printable)
lemmatizer = WordNetLemmatizer()


# In[3]:


categories= ['misc.forsale', 'sci.electronics', 'talk.religion.misc']


# In[4]:


news_data = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42, download_if_missing=True)
news_data_df = pd.DataFrame({'text' : news_data['data'], 'category': news_data.target})
assert sorted(list(unique(news_data.target))) == sorted([0, 1, 2])
print(news_data_df.head())
Example No. 43
 def __init__(self):
     self.wnl = WordNetLemmatizer()
     self.tagger = PerceptronTagger()
Example No. 44
def lemmatize_data(text):
    lemmatizer = WordNetLemmatizer()
    #    data=[lemmatizer.lemmatize(word) for word in text]
    data = " ".join([lemmatizer.lemmatize(word) for word in text])
    return data
Example No. 45
def notes():
    from PyDictionary import PyDictionary
    from summa import keywords
    from summa.summarizer import summarize
    import nltk
    from nltk.tokenize import sent_tokenize
    from newspaper import Article
    from docx import Document
    url = str(request.form['link'])
    a = Article(url)
    a.download()
    a.parse()
    f = a.text
    b = a.title
    a = a.text
    a = keywords.keywords(a)
    dictionary = PyDictionary()
    a = a.split('\n')
    a1 = []
    for i in a:
        x = i.split(' ')
        for j in x:
            a1.append(j)
    a1.sort(key=lambda s: len(s))
    a1.reverse()
    try:
        a1 = a1[:20]
    except:
        pass
    a = set(a1)
    a = tuple(a1)
    a1 = []
    for i in range(10):
        try:
            a1.append(a[i])
        except:
            pass
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    a = a1
    a1 = []
    for i in a:
        a1.append(lemmatizer.lemmatize(i))
    a = list(set(a1))
    a1 = a
    a = [dictionary.meaning(i) for i in a1]

    z = sent_tokenize(summarize(f, ratio=0.25))

    doc = Document()
    doc.add_heading('Notes for ' + b, 0)
    for i in z:
        doc.add_paragraph(i)
    doc.add_heading('Vocab Words from ' + b, 0)
    for i in range(len(a)):
        c = doc.add_paragraph(str(i + 1) + ') ')
        c.add_run(a1[i]).bold = True
        c.add_run(': ')
        d = str(list(a[i].values()))
        d = d.replace('[', '')
        d = d.replace(']', '')
        c.add_run(d)
        g = doc.add_paragraph('')
        g.add_run('Synonyms for ')
        g.add_run(a1[i].upper() + ': ').bold = True
        from datamuse import datamuse
        api = datamuse.Datamuse()
        s = api.words(ml=a1[i], max=10)
        s1 = []
        for i in s:
            for j in i:
                if j == 'word':
                    s1.append(i[j])
        g.add_run(str(s1).replace('[',
                                  '').replace(']',
                                              '').replace("'",
                                                          '')).italic = True
    whitelist = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
    fileName = b.replace(' ', '')
    fileName = ''.join(filter(whitelist.__contains__, fileName))
    fileName += '.docx'
    doc.save(fileName)
    import cloudmersive_convert_api_client
    from cloudmersive_convert_api_client.rest import ApiException
    configuration = cloudmersive_convert_api_client.Configuration()
    configuration.api_key['Apikey'] = 'f0c513bc-8c00-4491-830e-3e83b015feb6'
    api_instance = cloudmersive_convert_api_client.ConvertDocumentApi(
        cloudmersive_convert_api_client.ApiClient(configuration))
    try:
        # Convert Word DOCX Document to PDF
        api_response = api_instance.convert_document_docx_to_pdf(fileName)
        file = open('static/' + fileName.replace('.docx', '.pdf'), 'wb')
        file.write(api_response)
        file.close()
    except ApiException as e:
        print(
            "Exception when calling ConvertDocumentApi->convert_document_docx_to_pdf: %s\n"
            % e)
    myFile = fileName.replace('.docx', '.pdf')
    myFile2 = myFile
    note = Note(noteFile=str(myFile2), creator=current_user)
    db.session.add(note)
    db.session.commit()
    myFile = url_for('.static', filename=myFile)
    return render_template('notes.html', myFile=myFile)
Example No. 46
import random
import json
import pickle
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import SGD


lemmatizer = WordNetLemmatizer()

intents = json.loads(open('assistant/bot_comp/intents.json').read())

words = []
classes = []
documents = []
ignore_letters = ['?', '!', '.', ',']

for intent in intents['intents']:
    for pattern in intent['patterns']:
        word_list = nltk.word_tokenize(pattern)
        words.extend(word_list)
        documents.append((word_list, intent['tag']))
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

words = [lemmatizer.lemmatize(word)
         for word in words if word not in ignore_letters]
words = sorted(set(words))
Example No. 47
def lemmitizer(word):
    wordnet_lemmatizer = WordNetLemmatizer()
    newword = wordnet_lemmatizer.lemmatize(word)
    return newword
Example No. 48
def uploaded_file(filename, s, e):
    import fitz
    import pytesseract
    pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
    pdffile = filename
    doc = fitz.open('static' + '/' + filename)
    for i in range(int(s) - 1, int(e)):
        page = doc.loadPage(i)  # number of page
        pix = page.getPixmap()
        output = "outfile" + str(i) + ".png"
        pix.writePNG(output)
    x = ''
    for i in range(int(s) - 1, int(e)):
        x += pytesseract.image_to_string(f'outfile{str(i)}.png')
    from PyDictionary import PyDictionary
    from summa import keywords
    from summa.summarizer import summarize
    import nltk
    from nltk.tokenize import sent_tokenize
    from docx import Document
    f = x
    b = str(filename.replace('.pdf', ''))
    a = x
    a = keywords.keywords(a)
    dictionary = PyDictionary()
    a = a.split('\n')
    a1 = []
    for i in a:
        x = i.split(' ')
        for j in x:
            a1.append(j)
    a1.sort(key=lambda s: len(s))
    a1.reverse()
    try:
        a1 = a1[:20]
    except:
        pass
    a = set(a1)
    a = tuple(a1)
    a1 = []
    for i in range(10):
        try:
            a1.append(a[i])
        except:
            pass
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    a = a1
    a1 = []
    for i in a:
        a1.append(lemmatizer.lemmatize(i))
    a = list(set(a1))
    a1 = a
    a = [dictionary.meaning(i) for i in a1]

    z = sent_tokenize(summarize(f, ratio=0.25))

    doc = Document()
    doc.add_heading('Notes for ' + b, 0)
    for i in z:
        doc.add_paragraph(i)
    doc.add_heading('Vocab Words from ' + b, 0)
    for i in range(len(a)):
        c = doc.add_paragraph(str(i + 1) + ') ')
        c.add_run(a1[i]).bold = True
        c.add_run(': ')
        d = str(list(a[i].values()))
        d = d.replace('[', '')
        d = d.replace(']', '')
        c.add_run(d)
        g = doc.add_paragraph('')
        g.add_run('Synonyms for ')
        g.add_run(a1[i].upper() + ': ').bold = True
        from datamuse import datamuse
        api = datamuse.Datamuse()
        s = api.words(ml=a1[i], max=10)
        s1 = []
        for i in s:
            for j in i:
                if j == 'word':
                    s1.append(i[j])
        g.add_run(str(s1).replace('[',
                                  '').replace(']',
                                              '').replace("'",
                                                          '')).italic = True
    whitelist = set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
    fileName = b.replace(' ', '')
    fileName = ''.join(filter(whitelist.__contains__, fileName))
    fileName += '.docx'
    doc.save(fileName)
    import cloudmersive_convert_api_client
    from cloudmersive_convert_api_client.rest import ApiException
    configuration = cloudmersive_convert_api_client.Configuration()
    configuration.api_key['Apikey'] = 'f0c513bc-8c00-4491-830e-3e83b015feb6'
    api_instance = cloudmersive_convert_api_client.ConvertDocumentApi(
        cloudmersive_convert_api_client.ApiClient(configuration))
    try:
        # Convert Word DOCX Document to PDF
        api_response = api_instance.convert_document_docx_to_pdf(fileName)
        file = open('static/' + fileName.replace('.docx', '.pdf'), 'wb')
        file.write(api_response)
        file.close()
    except ApiException as e:
        print(
            "Exception when calling ConvertDocumentApi->convert_document_docx_to_pdf: %s\n"
            % e)
    myFile = fileName.replace('.docx', '.pdf')
    myFile2 = myFile
    note = Note(noteFile=str(myFile2), creator=current_user)
    db.session.add(note)
    db.session.commit()
    myFile = url_for('.static', filename=myFile)
    return render_template('notes.html', myFile=myFile)
Ejemplo n.º 49
0
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import RegexpStemmer

pstemmer = PorterStemmer()
print(pstemmer.stem('dancing'))

from nltk.stem import WordNetLemmatizer

lzr = WordNetLemmatizer()

print(lzr.lemmatize('dancing'))

# but if we want to lemmatize as a different part of speech, we pass pos explicitly

print(lzr.lemmatize('dancing', pos='v'))
lstemmer = LancasterStemmer()
print(lstemmer.stem('cooking'))
# RegexpStemmer just strips the suffix pattern we give it (here 'er')
Rexpress = RegexpStemmer('er')
print(Rexpress.stem('cooker'))
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import difflib

STOP_WORDS = set(stopwords.words('english'))

CONFIG_REMOVE_STOP_WORDS = True
CONFIG_STEMMER = SnowballStemmer('english')  # Use None for no stemmer
CONFIG_MAX_FEATURES = 3000  # None for max_features=size of vocab
CONFIG_NGRAM_RANGE = (1, 1)  # (3,3)

TOKEN_STEMMER = SnowballStemmer("english")
TOKEN_LEMMATIZER = WordNetLemmatizer()

CONFIG_FAQ_FILEPATH = "./anon-qrels.txt"
CONFIG_FAQ_CATEGORY_FILEPATH = "./categories.txt"
CONFIG_STRING_SIMILARITY = 0.85


def string_similary(a, b):
    # https://stackoverflow.com/a/1471603
    seq = difflib.SequenceMatcher(a=a.lower(), b=b.lower())
    return seq.ratio()


def are_string_similar(a, b):
    return string_similary(a, b) > CONFIG_STRING_SIMILARITY
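# Quick illustration (not from the original snippet): near-duplicate questions score
# well above the 0.85 threshold, unrelated ones do not, e.g.
#   string_similary("How do I reset my password?", "How can I reset my password?")  -> close to 1.0 -> similar
#   string_similary("How do I reset my password?", "Where is the cafeteria?")       -> low ratio    -> not similar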
Ejemplo n.º 51
0
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# 'abb' (an abbreviation-expansion dict) is assumed to be defined elsewhere in the original module


def files(path):

    with open(path) as f:
        arr2 = f.read()

    arr2 = re.sub(r'[0-9]',' ', arr2)

    arr2 = re.sub(r'[^a-zA-Z]',' ', arr2)

    arr = []

    for word in arr2.split():
        # expand abbreviations where a mapping exists, otherwise keep the word as-is
        arr.append(abb.get(word, word))

    arr = ' '.join(arr)

    newx = arr.replace("\n", " ")

    from nltk.tokenize import RegexpTokenizer
    import string
    ret = re.sub(r"-\s", "-", newx)
    l = nltk.word_tokenize(ret)
    tokens = [x for x in l if not re.fullmatch('[' + string.punctuation + ']+', x)]

    words = [word.lower() for word in tokens]
    med_remove =['a', 'about', 'all', 'almost', 'also', 'although', 'always', 'among', 'an', 'and', 'another', 'any', 'are', 'as', 'at',
             'be', 'because', 'been', 'being', 'between', 'both', 'but', 'by','can', 'could','did', 'do', 'does', 'done', 'due',
              'each', 'either', 'enough', 'especially', 'etc','for', 'found', 'from', 'further','had', 'has', 'have', 'having', 'here', 'how', 'however',
             'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself','just','kg','km','made', 'mainly', 'make', 'may', 'mg', 'might', 'ml', 'mm', 'most', 'mostly', 'must',
             'nearly', 'neither', 'nor','obtained', 'of', 'often', 'on', 'our', 'overall','perhaps', 'pmid','quite','rather', 'really', 'regarding',
             'seem', 'seen', 'several', 'should', 'show', 'showed', 'shown', 'shows', 'significantly', 'since', 'so', 'some', 'such',
             'than', 'that', 'the', 'their', 'theirs', 'them', 'then', 'there', 'therefore', 'these', 'they', 'this', 'those', 'through', 'thus', 'to',
             'upon', 'use', 'used', 'using','various', 'very','was', 'we', 'were', 'what', 'when', 'which', 'while', 'with', 'within', 'without', 'would']



    new_stopwords = set(stopwords.words('english')+med_remove) - {'after','again','before','no','during','not'}
    words = [w for w in words if not w in new_stopwords]

    lemmatizer = WordNetLemmatizer()
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in words])

    ret = re.sub(r"\s'", "'", lemmatized_output)

    final = ret.rstrip()

    print(final)
Ejemplo n.º 52
0
import pandas as pd 
df = pd.read_csv('C:/Users/abhatt/Desktop/python/data/Consumer_Complaints.csv', encoding='latin-1')
df.shape                                                    # 555,957 x 18
df.dtypes
df = df[['product', 'company', 'consumer_complaint_narrative']]
df = df.rename(columns = {'consumer_complaint_narrative':'narrative'})
df = df[pd.notnull(df['narrative'])]
df['narrative'] = df['narrative'].str.replace('XXXX','')    # Redacted content
df.shape                                                    # 66,806 x 3
df.head()

import time
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

start_time = time.time()                                    # Takes 32.2 minutes!
clean_text = []
for text in df['narrative']:
    words = regexp_tokenize(text.lower(), r'[A-Za-z]+')
    words = [w for w in words if len(w)>1 and w not in stopwords.words('english')]
    words = [lemmatizer.lemmatize(w) for w in words]
    clean_text.append(' '.join(words))
print('Elapsed clock time: ', (time.time() - start_time)/60, ' minutes')
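# A possible speed-up (sketch, not in the original): build the stopword set once
# instead of calling stopwords.words('english') for every narrative:
#   stop_set = set(stopwords.words('english'))
#   words = [w for w in words if len(w) > 1 and w not in stop_set]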

len(clean_text)
df['clean_text'] = clean_text
df.head()

# Pickle file for later use
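# (sketch; the pickling code itself is not shown in this excerpt)
# df.to_pickle('consumer_complaints_clean.pkl')   # filename is illustrative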
Ejemplo n.º 53
0
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

# wtokens, stokens and sentence are assumed to be defined earlier in the original script
pStemmer = PorterStemmer()
lStemmer = LancasterStemmer()
sStemmer = SnowballStemmer('english')

n1 = 0
for t in wtokens:
    n1 = n1 + 1
    if n1 < 4:
        print(pStemmer.stem(t), lStemmer.stem(t), sStemmer.stem(t))

print("\n= POS / Lemmatization =\n")

# Apply POS
# Apply Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

n1 = 0
for t in wtokens:
    n1 = n1 + 1
    if n1 < 6:
        print("Lemmatizer:", lemmatizer.lemmatize(t), ",    With POS=a:", lemmatizer.lemmatize(t, pos="a"))

print("\n= Trigram =\n")
# Apply Trigram
from nltk.util import ngrams

token = nltk.word_tokenize(sentence)

n = 0
for s in stokens:
Ejemplo n.º 54
0
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

all_stopwords = stopwords.words('english')

additional_stop_word_list1 = [
    '===', '====', '...', 'one', 'e.g', 'two', 'three', 'four', 'five', 'six',
    'seven', 'eight', 'nine', 'ten', 'e.g.'
]
additional_stop_word_list2 = [
    '|', '=', '&', '{', '}', '[', ']', '>', '<', '?', '!', '$', '%', '#', "'",
    '--', ')', '(', "''", '``', ':', ';', "'s", '10', '6', '7', '8', '9', '5',
    '4', '3', '0', '1', '2', 'j.', 'c.', 'm.', 'a.', '\\', '^', 'x', 'h', 'q',
    'l', 'w', 'g', 'c', 'n', 'f', 'r', 'k', 'p', 'j', 'e', 'b', 'u', 'v', 'le',
    'de', ',', '.', '==', '+', '–', '-', '—', '−', '_'
]

all_stopwords.extend(additional_stop_word_list1)
all_stopwords.extend(additional_stop_word_list2)

print(all_stopwords)
print(len(all_stopwords))

tokens = word_tokenize(
    'When the train is stationary or after a certain time (e.g. the time for "route releasing" of the overlap, the release speed calculation shall be based on the distance to the danger point (if calculated on-board). The condition for this change shall be defined for each target as infrastructure data.'
)

lem = WordNetLemmatizer()
for j in tokens:
    l = lem.lemmatize(j)
    print(l)
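# (the stop-word list built above would typically be applied here,
#  e.g. keeping a lemma only if it is not in all_stopwords)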
Ejemplo n.º 55
0
    def lemmatiseVerbs(self):

        lemmatizer = WordNetLemmatizer()
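        # e.g. self.tokens = ['running', 'ran', 'is'] becomes ['run', 'run', 'be']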
        return [lemmatizer.lemmatize(w, 'v') for w in self.tokens]
Ejemplo n.º 56
0
               Yet we have not done this to any other nation. We have not conquered anyone. 
               We have not grabbed their land, their culture, 
               their history and tried to enforce our way of life on them. 
               Why? Because we respect the freedom of others.That is why my 
               first vision is that of freedom. I believe that India got its first vision of 
               this in 1857, when we started the War of Independence. It is this freedom that
               we must protect and nurture and build on. If we are not free, no one will respect us.
               My second vision for India’s development. For fifty years we have been a developing nation.
               It is time we see ourselves as a developed nation. We are among the top 5 nations of the world
               in terms of GDP. We have a 10 percent growth rate in most areas. Our poverty levels are falling.
               Our achievements are being globally recognised today. Yet we lack the self-confidence to
               see ourselves as a developed nation, self-reliant and self-assured. Isn’t this incorrect?
               I have a third vision. India must stand up to the world. Because I believe that unless India 
               stands up to the world, no one will respect us. Only strength respects strength. We must be 
               strong not only as a military power but also as an economic power. Both must go hand-in-hand. 
               My good fortune was to have worked with three great minds. Dr. Vikram Sarabhai of the Dept. of 
               space, Professor Satish Dhawan, who succeeded him and Dr. Brahm Prakash, father of nuclear material.
               I was lucky to have worked with all three of them closely and consider this the great opportunity of my life. 
               I see four milestones in my career"""

sentences = nltk.sent_tokenize(paragraph)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [
        lemmatizer.lemmatize(word) for word in words
        if word not in stop_words
    ]
    sentences[i] = ' '.join(words)
Ejemplo n.º 57
0
# Imports assumed for this snippet; 'doc' is the raw text string being cleaned
import contractions
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Expand contractions (can skip since they are removed in the next step)
doc = contractions.fix(doc).replace('  ', ' ')
print('\nExpanded contractions.')

# Tokenize
tokens = word_tokenize(doc)
print(f'\nTokenizing:\n{tokens[:40]}...')

# Remove stop words
stop_words = set(stopwords.words("english"))
tokens = [w for w in tokens if w not in stop_words]
print('\nRemoved stopwords.')

# Lemmatization
wordnet_lem = WordNetLemmatizer()
tokens_lem = [wordnet_lem.lemmatize(token) for token in tokens]

# Stemming (skip as word meaning is lost)
porter_stem = PorterStemmer()
tokens_stem = [porter_stem.stem(token) for token in tokens_lem]

print(f'\nAfter Lemmatization and removing Stop words:\n{tokens_lem[:40]}...')

# Word frequency
fdist = FreqDist(tokens_lem)
print(f'\nMost common words:\n{fdist}')
print(fdist.most_common(20))
fdist.plot(20, cumulative=False)

# END
Ejemplo n.º 58
Y1 = np.array(Y1.values.tolist())
for i in range(len(X1)):
  documents.append([list(word_tokenize(X1[i])), Y1[i]]) 

X2 = np.array(X2.values.tolist())
Y2 = np.array(Y2.values.tolist())
for i in range(len(X2)):
  documents.append([list(word_tokenize(X2[i])), Y2[i]]) 

documents[0]

from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer 
stem1 = PorterStemmer() 
stem2=LancasterStemmer() 
word_lemma = WordNetLemmatizer() 
stop_english = stopwords.words("english") 
punct="?:!.,;'\"-()"
r_stop = True
useStem = False
useLemma = False
removePuncs = True

for l in range(len(documents)):                   
  label = documents[l][1]                          
  newReview = []                                   
  for w in documents[l][0]:                  
    newWord = w                                    
    if r_stop and (w in stop_english):  
      continue                                    
    if removePuncs and (w in punct):        
Ejemplo n.º 59
0
import re
import pickle
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score

#Reading the Dataset
df = pd.read_csv('train.csv')

#Dropping the rows of null values
df = df.dropna()

messages = df.copy()
messages.reset_index(inplace=True)

ps = PorterStemmer()
lemma = WordNetLemmatizer()

corpus = []
for i in range(0, len(messages)):
    review = (re.sub('[^a-zA-Z]', ' ', messages['title'][i])).lower()
    review = review.split()

    review = [
        lemma.lemmatize(word, pos='n') for word in review
        if word not in stopwords.words('english')
    ]
    review = ' '.join(review)
    corpus.append(review)

#Extract features with CountVectorizer
cv = CountVectorizer(max_features=2000, ngram_range=(1, 3))
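# (illustrative; not shown in the original excerpt) the cleaned corpus would then be vectorized:
# X = cv.fit_transform(corpus).toarray()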
Ejemplo n.º 60
0
# prepare nltk; nltk.download() with no arguments opens the interactive downloader for its datasets
import nltk
nltk.download()
# import the names corpus
from nltk.corpus import names
print(names.words()[:10])
print(len(names.words()))

# import the PorterStemmer algorithm from the appropriate dir
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('machines')
# outputs 'machin'
porter_stemmer.stem('learning')
# outputs 'learn'

# import a lemmatization algorithm
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('machines')
# outputs 'machine'
lemmatizer.lemmatize('learning')
# outputs 'learning', because the default part of speech is NOUN
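# passing the part of speech changes the result, e.g.:
lemmatizer.lemmatize('learning', pos='v')
# outputs 'learn'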