Code example #1
File: OrigReader.py Project: wencanluo/Summarization
	def AddTopicUnigram(self, feaName, comName, data=None):
	#need mapping first
		if data is None:
			data = self._data
			
		for i in range(len(data)):	
			t_bigram = self.getEssayCollocation(data, i)
			
			t_uni = list()
			for (a, b) in t_bigram:
				t_uni.append(a)
				t_uni.append(b)
			t_uni = set(t_uni)
			
			comment = data[i][comName]
			tokens = nltk.wordpunct_tokenize(comment)
			tokens = [word.lower() for word in tokens]
		
			#stemming
			if self._stemoption ==True:
				st = PorterStemmer()
				tokens = [st.stem(t) for t in tokens]
				t_uni  = set([st.stem(t) for t in list(t_uni)])
			shared = [w for w in tokens if w in t_uni]
			#normalized
			data[i][feaName] = float(len(shared))/(len(tokens)+0.00001)
Code example #2
File: OrigReader.py Project: wencanluo/Summarization
	def getDomainUnigram(self, directory = None):		
		collocations = set()  #collocation items
		ewordlists = list() #list of lists of words
		
		#extract words from essays
		if directory is not None:
			doclist = os.listdir(directory)
			for essay in doclist:
				dir_essay  = directory+'/'+essay
				etext = open(dir_essay,'r').read()
				tokens = nltk.wordpunct_tokenize(etext)
				tokens = [word.lower() for word in tokens]
				#stemming
				if self._stemoption ==True:
					st = PorterStemmer()
					tokens = [st.stem(t) for t in tokens]
				
				#extract the collocation for the given essay
				e_bigram = set(Mytext(tokens).collocations())
				collocations = collocations | e_bigram
				ewordlists.append(tokens)
				
		else: # using the mapped essay to calculate the candidate bigrams
			#need to call mapessay function first
			for ins in self._data:
				if ins['essay'] is not None:
					etext = open(ins['essay'],'r').read()
					tokens = nltk.wordpunct_tokenize(etext)
					tokens = [word.lower() for word in tokens]
					#stemming
					if self._stemoption ==True:
						st = PorterStemmer()
						tokens = [st.stem(t) for t in tokens]
				
					#extract the collocation for the given essay
					e_bigram = set(Mytext(tokens).collocations())
					collocations = collocations | e_bigram
					ewordlists.append(tokens)
		
		#get collection of all essays under the specified directory / associated essays
		collection_text = TextCollection(ewordlists)
		
		itemlist = list()
		for (a, b) in collocations:
			itemlist.append(a)
			itemlist.append(b)
			
		itemlist = list(set(itemlist))	
		
		word_idf = []
		for i in range(len(itemlist)):
			word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))	
		
		word_idf = sorted(word_idf, key = operator.itemgetter(0))
		ave = 0
		if len(word_idf)!=0:
			ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)
			
		wlist =  [j for (i, j) in word_idf if i<ave]				
		return wlist
Code example #3
def extract_entities(doc):
    print 'extracting entities from %s...' % doc.getFilename()
    nps = list(set([re.sub(' \.', '', re.sub(' -[A-Z]{3}-', '', np).lower()) for np in doc.getAllNodesOfType('NP')]))
    p = PorterStemmer()
    entities = []
    for np in nps:
        try:
            response = json.loads(requests.get(host+'select', params={'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np), 'fl': 'title_en,redirect_titles_mv_en', 'wt': 'json'}).content)
        except requests.exceptions.ConnectionError:
            while True:
                time.sleep(15)
                print 'retrying connection...'
                try:
                    response = json.loads(requests.get(host+'select', params={'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np), 'fl': 'title_en,redirect_titles_mv_en', 'wt': 'json'}).content)
                    break
                except requests.exceptions.ConnectionError:
                    continue
        docs = response[u'response'][u'docs']
        if len(docs) > 0:
            titles = [docs[0][u'title_en']] + docs[0].get(u'redirect_titles_mv_en', [])
        else:
            titles = []
        if len(titles) > 0:
            titles = [' '.join([p.stem(w.lower()) for w in t.split(' ')]) for t in titles]
        stem_np = ' '.join([p.stem(w) for w in np.split(' ')])
        for title in titles:
            if stem_np == title:
                entities.append(np)
                print np
                break
    #print doc.getFilename(), entities
    return (doc.getFilename(), entities)
Code example #4
File: wikipedia.py Project: slee17/NLP
def compare_english_simple(article_title):
    """Given a title of an article, returns the number of tokens, types, and stems
    in both the English version and the simple English version."""
    english = extract_wikipedia_page(article_title, "en")
    simple = extract_wikipedia_page(article_title, "simple")
    num_tokens_english = len(english)
    num_tokens_simple = len(simple)
    types_english = count_words(get_words(english))
    types_simple = count_words(get_words(simple))
    
    porter_stemmer = PorterStemmer()
    
    stem_english = defaultdict(int)
    stem_simple = defaultdict(int)
    for key in types_english.keys():
        stem_english[porter_stemmer.stem(key)] += 1
    for key in types_simple.keys():
        stem_simple[porter_stemmer.stem(key)] += 1
    
    print ("Number of Tokens in English " + article_title + ": %d" % num_tokens_english)
    print ("Number of Tokens in Simple English " + article_title + ": %d" % num_tokens_simple)
    print ("Number of Types in English " + article_title + ": %d" % len(types_english))
    print ("Number of Types in Simple English " + article_title + ": %d" % len(types_simple))
    print ("Number of Stems in English " + article_title + ": %d" % len(stem_english))
    print ("Number of Stems in Simple English " + article_title + ": %d" % len(stem_simple))
Code example #5
File: LoadData.py Project: suket22/CS246
    def parse_questions(self):
        stemmer = PorterStemmer()
        tokenizer = RegexpTokenizer(r'\w+')
        for questions_key in self.rawSamples:
            # Stem the Question Text
            question_text = self.rawSamples[questions_key][0]
            words_array = tokenizer.tokenize(question_text)
            question_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                question_text += (word + " ")
            self.rawSamples[questions_key][0] = question_text

            # Stem the topic names
            topics_text = self.rawSamples[questions_key][2]
            words_array = tokenizer.tokenize(topics_text)
            topics_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                topics_text += (word + " ")
            self.rawSamples[questions_key][2] = topics_text
Code example #6
def extractFeatures(dataSet):
    vector1, vector2 = list(), list()
    
    stemmer = PorterStemmer()
    # Produces list of all unique word stems in the titles in the dataset
    wordBag = list({stemmer.stem(word) for entry in dataSet for word in entry[2].strip().split(" ") if not word in stopwords.words('english')})


    for entry in dataSet:
        genre, isbn, title, authors = entry[0], entry[1].strip(), entry[2].strip(), entry[3].strip()

        wordList, authorList = [word for word in title.split(" ")], [author.strip() for author in authors.split(";")]
        sortedWords = sorted(wordList, key = lambda x: -1*len(x))
        nonStopWords = [word for word in sortedWords if not word in stopwords.words('english')]
        stemmedWords = [stemmer.stem(word) for word in nonStopWords]

        # Quantitative data about the title
        shortestWord = len(nonStopWords[-1])
        longestWord = len(nonStopWords[0])
        meanWord = sum([len(word) for word in nonStopWords])/len(nonStopWords)
        wordSD = (sum([(len(word)-meanWord)**2 for word in nonStopWords])/len(nonStopWords))**.5

        vector1.append([(len(authorList), len(wordList), longestWord, shortestWord, meanWord, wordSD), genre])
        
        # Creates a vector storing whether a word in a dataset occurred in the title
        occurrences = tuple(1 if word in stemmedWords else 0 for word in wordBag)
        
        vector2.append([occurrences, genre])

    return (vector1,vector2)
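
The entry layout this function expects is (genre, isbn, title, authors separated by semicolons). A minimal, hypothetical usage sketch; the sample rows below are invented purely for illustration:

# Hypothetical usage of extractFeatures; the sample rows are invented for illustration.
sample_data = [
    ("fantasy", "9780261103573", "The Fellowship of the Ring", "J.R.R. Tolkien"),
    ("horror", "9780307743657", "The Shining", "Stephen King"),
]
quant_vectors, bow_vectors = extractFeatures(sample_data)
# quant_vectors[i] -> [(numAuthors, numTitleWords, longest, shortest, mean, stddev), genre]
# bow_vectors[i]   -> [tuple of 0/1 flags over the shared stemmed word bag, genre]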
Code example #7
class PostProcessor:
	def __init__(self):
		"""Loads in Ed and Olivier's domainRules.json file, now converted to a big (7k+ entry) dict object"""
		#import domainRules.json
		from domain_rules import domain_rules
		from tldextract.tldextract import extract
		self.extract = extract
		from nltk.stem.porter import PorterStemmer as PorterStemmer
		self.domain_rules = domain_rules
		#create stemmer
		self.Stemmer = PorterStemmer()
		
	def rerank(self, url, text, results):
		"""Processes classified results"""
		
		#check if the domain exists in domainrules
		domain = self.extract(url)
		domain = domain.domain + "." + domain.suffix
		
		print "Extracted domain: {0}".format(domain)
		
		if domain in self.domain_rules:
			print "found domain"
			if "__ANY" in self.domain_rules[domain]:
				categories = self.domain_rules[domain]['__ANY']
				for cat in categories:
					#stem it
					matchers = [self.Stemmer.stem(cat)]
					if "-" in matchers[0]:
						matchers.append(matchers[0].replace("-", "_"))
					for matcher in matchers:
						for x in range(len(results)):
							print "comparing {0} to {1}".format(matcher, results[x][0])
							if matcher.lower() in results[x][0].lower():
								print "{0} with score {1} contains {2}".format(results[x][0], results[x][1], matcher)
								results[x][1]  = results[x][1] + 1
								print "score is now {0}".format(results[x][1])
		else:
			print "augmenting common words"
			#check for common words
			words = defaultdict(int)
			for result in results:
				tokens = re.findall("[a-z]+", result[0].lower())
				for token in tokens:
					words[token] += 1
			
			#remove single entries
			for k,v in words.iteritems():
				if v > 1:
					for x in range(len(results)):
						matchers = [self.Stemmer.stem(k)]
						if "-" in matchers[0]:
							matchers.append(matchers[0].replace("-", "_"))
						for matcher in matchers:
							if matcher.lower() in results[x][0].lower():
								print "{0} with score {1} contains {2} which has score {3}".format(results[x][0], results[x][1], matcher, v)
								results[x][1] = results[x][1] + v
								print "score is now {0}".format(results[x][1])
		
		return sorted(results, key=lambda x:x[1], reverse=True)
Code example #8
def search(ngrams, index, path, counts, id):

    print 'Searching {}'.format(path.split('/')[-1])

    # If 'Graph!' button was hit with nothing in box
    if ngrams == '':
        return None

    if len(ngrams) > 1:
        ngrams = ngrams.replace(', ', ',').encode('utf-8').lower().split(',')
    else:
        ngrams = ngrams.encode('utf-8').lower()

    ngram_count = {ngram: defaultdict(int) for ngram in ngrams}
    stemmer = PorterStemmer()

    for ngram in ngrams:

        transcripts = list()

        for word in ngram.split():

            # Get stem of word
            word = stemmer.stem(word)

            try:
                # Get set of books the word appears in
                transcripts.append(set([posting[0] for posting in index[word]]))
            except:
                # If the word is not in the index
                pass

        # Get the set of transcripts in which all words in the ngram appear
        transcripts = set.intersection(*transcripts) if len(transcripts) > 0 else set()

        for transcript in transcripts:

            year = int(transcript.split('-')[1])
            month = int(transcript.split('-')[2])
            day = int(transcript.split('-')[3])
            date = datetime(year, month, day)
            locs = []

            # For each transcript, get all of the locations of where the words in the ngram appear
            for word in ngram.split():
                word = stemmer.stem(word)
                locs.extend([posting[1] for posting in index[word] if posting[0] == transcript])

            # Check if the words are next to each other
            # e.g. ngram = 'very high profit margin' and the positions of the words are [[2,10] [3], [4,8,12,29], [5]]
            # This line of code will shift the position of each word over by its distance from the
            # beginning of the ngram to produce new positions [[2,10], [2], [2,6,10,27], [2]]
            # Then I take the intersection of these positions -- if it's not empty,
            # then the ngram appears in the transcript
            locs = [set([int(pos) - i for pos in loc]) for i, loc in enumerate(locs)]
            ngram_count[ngram][date] += len(set.intersection(*locs))

    counts[id] = ngram_count
    print 'Finished searching {}'.format(path.split('/')[-1])
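
The position-shift check described in the comments above can be verified in isolation. A standalone sketch with made-up word positions (in the snippet these come from the postings index):

# Standalone illustration of the position-shift intersection described above.
# The positions are hypothetical; in the snippet they come from index[word].
positions = [[2, 10], [3], [4, 8, 12, 29], [5]]  # offsets of 'very', 'high', 'profit', 'margin'
shifted = [set(int(pos) - i for pos in loc) for i, loc in enumerate(positions)]
# shifted == [{2, 10}, {2}, {2, 6, 10, 27}, {2}]
print(len(set.intersection(*shifted)))  # 1 -> the 4-gram occurs once, starting at offset 2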
Code example #9
File: extract_features.py Project: dubstack/asag
def get_bleu_similarity(reference_answers, student_answer):
	porter_stemmer = PorterStemmer()
	reference_answers_tokens = []
	for answer in reference_answers:
		reference_answers_tokens.append(map(lambda x: str(porter_stemmer.stem(x)), answer.split()))
	student_answer = map(lambda x: str(porter_stemmer.stem(x)), student_answer.split())
	weights = [0.25, 0.25]
	return bleu(student_answer,reference_answers_tokens, weights)
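
A hypothetical call, assuming the bleu function imported in this module takes (candidate, references, weights) as used above; the answer strings are invented:

# Hypothetical usage; the reference and student answers are invented for illustration.
references = ["the cell membrane controls what enters and leaves the cell",
              "the membrane regulates transport into and out of the cell"]
student = "the membrane controls what goes in and out of the cell"
print(get_bleu_similarity(references, student))  # BLEU-style score in [0, 1]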
Code example #10
File: parser.py Project: mattea/mattea-utils
def stem(ts):
	global stemmer
	if stemmer is None:
		stemmer = PorterStemmer()
	if type(ts) is list:
		return [stemmer.stem(x) for x in ts]
	else:
		return stemmer.stem(ts)
Code example #11
class PropertyFinder(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def __get_property_string_forms(self, property_subtree):
        words = stopwords.words('english')

        property_string_forms = set()
        property_string_forms.add((' '.join(property_subtree.leaves())).lower())
        property_string_forms.add((' '.join([self._stemmer.stem(word) for word in property_subtree.leaves()])).lower())
        property_string_forms.add((' '.join([word for word in property_subtree.leaves() if word not in words])).lower())
        property_string_forms.add((' '.join([self._stemmer.stem(word) for word in property_subtree.leaves() if word not in words])).lower())

        return property_string_forms

    def __fetch_from_wikibase(self, property_string):
        labels = DataBase().search_properties_name(property_string)
        if labels is None:
            return []
        return [label.lower() for label in labels]

    def __fetch_synonyms_and_hypernyms(self, property_string):
        words = set()
        synsets = wordnet.synsets(property_string)
        for synset in synsets:
            words.update([lemma.replace('_', ' ').lower() for lemma in synset.lemma_names()])
            for hypernym in synset.hypernyms():
                words.update([lemma.replace('_', ' ').lower() for lemma in hypernym.lemma_names()])
        return words

    def find_candidates(self, property_subtree):
        if not isinstance(property_subtree, ParentedTree):
            raise AttributeError

        candidates = set(self.__get_property_string_forms(property_subtree))

        new_candidates = set()
        for candidate in candidates:
            for label in self.__fetch_from_wikibase(candidate):
                new_candidates.add(label)
        candidates.update(new_candidates)

        new_candidates = set()
        for candidate in candidates:
            new_candidates.update(self.__fetch_synonyms_and_hypernyms(candidate))
        candidates.update(new_candidates)

        new_candidates = set()
        for candidate in candidates:
            for POS in [wordnet.ADJ, wordnet.ADV, wordnet.NOUN, wordnet.VERB]:
                morphy = wordnet.morphy(candidate, POS)
                if morphy is not None:
                    new_candidates.add(morphy)
        candidates.update(new_candidates)

        return candidates
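
The WordNet expansion inside __fetch_synonyms_and_hypernyms can be previewed on its own. A minimal sketch, assuming NLTK's WordNet corpus is installed; the example word is arbitrary:

# Minimal standalone sketch of the synonym/hypernym expansion used above.
from nltk.corpus import wordnet

expanded = set()
for synset in wordnet.synsets('spouse'):
    expanded.update(lemma.replace('_', ' ').lower() for lemma in synset.lemma_names())
    for hypernym in synset.hypernyms():
        expanded.update(lemma.replace('_', ' ').lower() for lemma in hypernym.lemma_names())
print(expanded)  # e.g. {'spouse', 'partner', 'better half', 'married person', 'relative', ...}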
Code example #12
File: SentenceParser.py Project: frozstone/concept
    def __weight_tokens(self, mid, nps, sentences, sent_id):
        st          = PorterStemmer()
        sent_target = sentences[sent_id]
        token_id    = [idx for idx, token in enumerate(sent_target.strip().split(" ")) if mid in token][0]

        sent_lengths= [len(s.split(" ")) for s in sentences]

        nps_base = {np:" ".join(st.stem(token) for token in np.split(" ")) for np in nps}
        nps_proc = {}

        for sent_idx, sent in enumerate(sentences):
            sent_stem = " ".join(st.stem(token) for token in sent.split(" "))
            for np_ori, np in nps_base.iteritems():
                if np_ori not in nps_proc: nps_proc[np_ori] = {}

                if "dist_sent" not in nps_proc[np_ori] or abs(sent_idx - sent_id) < nps_proc[np_ori]["dist_sent"]:
                    #always update the info
                    if np not in sent_stem: 
                        continue
                    np_idx      = sent_stem.rindex(np)
                    np_token_idx= len(sent_target[:np_idx].strip().split(" "))
                    dist_start  = len(sent_stem[:np_idx].strip().split(" "))
                    dist_end    = len(sent_stem[np_idx+len(np):].strip().split(" "))

                    dist_sent   = abs(sent_idx - sent_id)
                    dist_token  = -1

                    if dist_sent == 0:
                        if mid in np_ori:
                            dist_token = 0
                        elif np_token_idx < token_id:
                            dist_token = token_id - np_token_idx - (len(np.split(" ")) - 1) - 1
                        elif np_token_idx > token_id:
                            dist_token = np_token_idx - token_id - 1
                    elif sent_idx < sent_id: 
                        dist_token = dist_end + sum(sent_lengths[sent_idx+1:sent_id]) + token_id
                    elif sent_idx > sent_id:
                        dist_token = (len(sent_target.strip().split(" "))-1-token_id) + sum(sent_lengths[sent_id+1:sent_idx]) + dist_start

                    nps_proc[np_ori]["dist_sent"]  = dist_sent
                    nps_proc[np_ori]["dist_token"] = dist_token

                np_count = sent_stem.count(np)
                nps_proc[np_ori]["tf"] = (nps_proc[np_ori].get("tf") or 0) + np_count

        nps_weight = {}
        for np, vals in nps_proc.iteritems():
            term1 = self.__alpha * self.__gaussian_weight(vals["dist_token"], self.__var_d)
            term2 = self.__beta  * self.__gaussian_weight(vals["dist_sent"],  self.__var_s)
            term3 = self.__gamma * vals["tf"]
            nps_weight[np] = (term1 + term2 + term3) / (self.__alpha + self.__beta + self.__gamma)
        return nps_weight
Code example #13
File: IR.py Project: pranavbahl2308/VectorSpaceModel
	def preProcessing(self,raw,fileName):
		cachedStopWords = stopwords.words("english")
		stemmer = PorterStemmer()
		text = ' '.join([word for word in raw.split() if word not in cachedStopWords])
		tokens = nltk.word_tokenize(text.lower())
		stemmed = []
		directory = os.getcwd()+"/pre-process/" 
		if not os.path.exists(directory):
			os.makedirs(directory)
		test = open(directory+re.sub('\.htm$', '', fileName)+".txt","w")
		for item in tokens:
			stemmed.append(stemmer.stem(item))
			test.write(stemmer.stem(item)+' ')
		test.close()
		return stemmed
Code example #14
File: search.py Project: kaiserahmed/CS3245
def search(dictionary_file, postings_file, query_file, output_file):
    """ Entry point to the program """

    stemmer = PorterStemmer()
    with open(dictionary_file, "rb") as dfile:
        dictionary = pickle.loads(dfile.read())

    with open(query_file, "rb") as qfile:
        with open(postings_file, "rb") as pfile:
            for query in qfile:
                print "query: ", query
                prefix = parser.to_polish_notation(query)
                print "prefix: ", prefix
                processed = []
                for token in prefix:
                    if parser.is_operand(token):
                        token = stemmer.stem(token).lower()
                    processed.append(token)

                print "processed: ", processed
                query = parser.process_query(processed)
                print "query: ", query
                result = execute_query(query, dictionary, pfile)

                print result
Code example #15
File: textual_features.py Project: gsi-upm/gsitk
	def createLDAModel(texts, n_topics, n_passes):
		"""Generates a LDA model from an array of texts
		"""
		tokenizer = RegexpTokenizer(r'\w+')
		#Create EN stop words list
		en_stop = get_stop_words('en')
		#Create p_stemmer of class PorterStemmer
		p_stemmer = PorterStemmer()

		texts_ = []

		# loop through document list
		for i in texts:
		    
		    # clean and tokenize document string
		    raw = i.lower()
		    tokens = tokenizer.tokenize(raw)
		    
		    # remove stop words from tokens
		    stopped_tokens = [i for i in tokens if not i in en_stop]
		    # stem tokens
		    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
		    # add tokens to list
		    texts_.append(stemmed_tokens)

		# turn our tokenized documents into a id <-> term dictionary
		dictionary = corpora.Dictionary(texts_)

		# convert tokenized documents into a document-term matrix
		corpus = [dictionary.doc2bow(text) for text in texts_]

		# generate LDA model
		ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics, id2word = dictionary, passes=n_passes)

		return(ldamodel)
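
A hypothetical call with two toy documents, assuming the function is accessible as a plain function and the snippet's own imports (gensim, corpora, RegexpTokenizer, get_stop_words, PorterStemmer) are available:

# Hypothetical usage; the documents are invented for illustration.
docs = [
    "Topic models discover latent themes in a collection of documents.",
    "Stemming reduces inflected words to a common root before topic modeling.",
]
ldamodel = createLDAModel(docs, n_topics=2, n_passes=10)
print(ldamodel.print_topics(num_topics=2, num_words=5))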
Code example #16
def tokenStem(words):
    words = words.strip('[').strip(']').lower() #remove brackets and lowercase
    words = re.sub('[(){}<>:,.!?\'"]', '', words)
    stemmer = PorterStemmer()
    stops = stopwords.words('english')
    output = [stemmer.stem(token) for token in wordpunct_tokenize(words) if token not in stops ] #stem words
    return " ".join(output) #merge into strings
Code example #17
File: rake_stem.py Project: neethukurian/keyextract
def main():

    rake=RAKE.Rake('SmartStoplist.txt')
    fp=open(input_file,'r')
    text=fp.read()
    text=text_clean(text)
    """wnl=WordNetLemmatizer()
    text=' '.join([wnl.lemmatize(i.strip()) for i in nltk.word_tokenize(text)])"""
    porter_stemmer=PorterStemmer()
    text=' '.join([porter_stemmer.stem(i.strip()) for i in nltk.word_tokenize(text)])
    keywords=rake.run(text)
   # print keywords

    with open(key_score_file,'wb') as out:
        csv_out=csv.writer(out)
        csv_out.writerow(['KEYWORD','SCORE'])
        for row in keywords:
            if row[1]>0:
                csv_out.writerow(row)


    unibitrigram_list=[]
    unibitrigram_list=generate_unibitrigrams(key_score_file)
    #print unibitrigram_list
    #ngram_freq=[]
    ngram_freq=Counter(unibitrigram_list)
    sorted_ngram_freq=sorted(ngram_freq.items(),key=lambda x:x[1],reverse=True )
    print ngram_freq
    with open('bcom_ngramfr_stem.csv','wb') as nf_csv:
        csv_wr=csv.writer(nf_csv)
        for item in sorted_ngram_freq:
            if ((item[0]!='')):
                csv_wr.writerow(item)
Code example #18
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens

    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
Code example #19
def get_stemmed_separate(indeed_reviews_db, glassdoor_reviews_db):
    separate = get_separate_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in separate:
        stemmed_reviews.append(' '.join([stemmer.stem(word) for sent in sent_tokenize(review) for word in word_tokenize(sent.lower())]))
    return stemmed_reviews
Code example #20
File: documentrep.py Project: SoAG/articleclustering
	def create_bag_of_words(self):
		"""Create a BagOfWords for the document. Performs named entity recognition, stemming and stopword removal. """
		stemmer = PorterStemmer()
		nes = []
		tagged_text = self.ner_tagger.get_entities(self.content.encode('utf-8'))
		for key in tagged_text.keys():
			if key != 'O':
				nes += tagged_text[key]
		for n in nes:
			self.bag_of_words.add_stem_word(n, n)
			Document.vocabulary.add_stem_word(n, n)

		wo_named = re.sub('|'.join(nes), '', self.content)

		words = re.findall(r'\w+', wo_named,flags = re.UNICODE | re.LOCALE) 
		for wordo in words:
			word = wordo.rstrip(r'\n')
			if word.lower() not in stopwords:
				w = stemmer.stem(word.lower())
				self.bag_of_words.add_stem_word(w, word)
				Document.vocabulary.add_stem_word(w, word)

		for word in self.bag_of_words.get_all_words():
			if word in Document.document_word_frequency:
				Document.document_word_frequency[word] += 1
			else:
				Document.document_word_frequency[word] = 1
Code example #21
File: search.py Project: drewblelow/cs3245-hw3
def evaluate(query):
	global DICTIONARY
	word_score = {}
	seek_pos = open(postings_file, 'r')
	seek_pos.seek(0,0)
	words = query.split()
	stemmer = PorterStemmer()
	words = [element.lower() for element in words]
	for item in words:
		word = stemmer.stem(item)
		if word not in word_score:	
			if word in DICTIONARY:
				seek_pointer = DICTIONARY[word]
				seek_pos.seek(int(seek_pointer))
				line = seek_pos.readline()
				seek_pos.seek(0,0)
				post_list = line.split()
				score = score_documents(post_list)
				word_score[word] = score
			else:
				#not encountered, score of 0
				word_score[word] = []
		#else duplicate, skip word
	result = score_query(word_score)
	return result
Code example #22
def stemText(s):
	ps = PorterStemmer()
	stemmedText = []
	for word in s:
		stemmedText.append(ps.stem(word))
		
	return stemmedText
Code example #23
def clean_split_stem(rawstring):
    stop = stopwords.words('english')
    out_str = rawstring.split()
    porter = PorterStemmer()
    out_str = [porter.stem(word) for word in out_str]
    out_str = [word for word in out_str if word not in stop]
    return out_str
Code example #24
def lda(data):
	data = get_only_text(data)
	only_tweet = data
	length = len(only_tweet)
	length = min(20,length)
	for i in xrange(0,length):
		print i
		print only_tweet[i]
	return
	
	tokenizer = RegexpTokenizer(r'\w+')
	en_stop = get_stop_words('en')
	p_stemmer = PorterStemmer()

	length = len(only_tweet)
	length = min(20,length)
	total_texts = []
	for i in xrange(0,length):
		print only_tweet[i]
		print 
		to_lower = only_tweet[i].lower()
		tokens = tokenizer.tokenize(to_lower)
		stopped_tokens = [k for k in tokens if not k in en_stop]
		texts = [p_stemmer.stem(k) for k in stopped_tokens]
		total_texts.append(texts)

	dictionary = corpora.Dictionary(total_texts)
	corpus = [dictionary.doc2bow(text) for text in total_texts]

	ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)
	result =  ldamodel.print_topics(num_topics=2, num_words=1)
	for i in result:
		print i
Code example #25
def PreProcessing(line):
    unigrams = line.split()
    word_list = [word.lower() for word in unigrams if word.lower() not in stopwords]
    st = PorterStemmer()
    word_list = [st.stem(word) for word in word_list if word]
    vocab = [word for word in word_list if word not in stopwords]
    return vocab
Code example #26
def preprocess_text(raw):
    lower_raw = raw.lower()
    tokens = nltk.word_tokenize(lower_raw)
    filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
    port = PorterStemmer() #This extracts the important root of a word. eg. parsing -> pars
    stemmed = [port.stem(item) for item in filtered_tokens]
    return stemmed
Code example #27
File: nbayes_classify.py Project: USStateDept/Polis
def cleanData(doc_list):
  # tokenize
  tokens = []
  for doc in doc_list:
    text_l = []
    ws_split = re.split(split_on, doc)
    for w in ws_split:
      # remove URLs and empty strings
      if not (url_pat.match(w) or w == u''):
        text_l.append(w)
  
    # rejoin text and 'properly' tokenize
    text = " ".join(text_l)
    text_l = nltk.word_tokenize(text)
    
    # stop words 
    text_l = [ w.lower() for w in text_l if w.lower() not in stops]
  
    # stemming
    p_stemmer = PorterStemmer()
    text_l = [p_stemmer.stem(t) for t in text_l]
    
    ## append cleaned text to list
    tokens.append(text_l)
  return tokens
Code example #28
File: make-lm-dev.py Project: kedz/cuttsum
def tokenize(docs, norm, stop, ne, central_per=None, central_loc=None, central_org=None):

    if stop:
        with open("stopwords.txt", "r") as f:
            sw = set([word.strip().decode("utf-8").lower() for word in f])

    if norm == "stem":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()

    all_toks = []
    for doc in docs:
        toks = []
        for sent in doc:
                if norm == "lemma":
                    stoks = [unicode(tok.lem).lower() for tok in sent]
                elif norm == "stem":
                    stoks = [stemmer.stem(unicode(tok).lower())
                             for tok in sent]
                else:
                    stoks = [unicode(tok).lower() for tok in sent]
                if stop:
                    toks.extend([tok for tok in stoks if tok not in sw])
                else:
                    toks.extend(stoks)
        toks = [tok for tok in toks if len(tok) < 50]
        #if len(toks) == 0: continue
        string = u" ".join(toks).encode("utf-8")
        #print string
        all_toks.append(string)
    return all_toks
Code example #29
def text_process(text):
    '''
    Takes in a string of text, then performs the following
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns the cleaned, stemmed text joined back into a single string
    '''
    if(pd.isnull(text)):
        return []
    
    # Tokenize 
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    
    # Removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    
    # Stemming
    porterStemmer = PorterStemmer()
    
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    
    try:
        text_processed.remove('b')
        
    except:
        pass
    
    return " ".join(text_processed)
Code example #30
class StemmerTokenizer(object):

    def __init__(self): 
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]
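
A __call__-able class like this is typically meant to be plugged into a scikit-learn vectorizer as a custom tokenizer. A minimal sketch, assuming word_tokenize and PorterStemmer are imported as in the snippet and scikit-learn >= 1.0 for get_feature_names_out:

# Minimal sketch: using the callable tokenizer with a scikit-learn vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=StemmerTokenizer())
X = vectorizer.fit_transform(["The runners were running quickly", "A runner runs"])
print(vectorizer.get_feature_names_out())  # stemmed vocabulary such as 'quickli', 'run', 'runner'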
Code example #31
def get_data():
    from collections import defaultdict as dd
    from nltk import word_tokenize
    from nltk.stem.porter import PorterStemmer
    import pickle

    stemmer = PorterStemmer()
    data = pickle.load(open("../data/person_pub_data.pkl", "rb"))
    title_df = dd(int)
    venue_df = dd(int)
    aff_df = dd(int)

    for d in data:
        for item in d["pubs"]:
            for w in word_tokenize(item["title"].lower()):
                title_df[stemmer.stem(w)] += 1
            if item["venue"]:
                for w in word_tokenize(item["venue"].lower()):
                    venue_df[stemmer.stem(w)] += 1
            for a in item["authors"].values():
                if a["aff"]:
                    for w in word_tokenize(a["aff"].lower()):
                        aff_df[stemmer.stem(w)] += 1

    with open("aff_vocab.pkl", "wb") as f_out:
        pickle.dump(list(aff_df.items()), f_out)
    with open("venue_vocab.pkl", "wb") as f_out:
        pickle.dump(list(venue_df.items()), f_out)
    with open("title_vocab.pkl", "wb") as f_out:
        pickle.dump(list(title_df.items()), f_out)

    pub_author_map = []
    authors_map = []
    for d in data:
        labels = dd(list)
        for item in d["pubs"]:
            labels[item["label"]].append(item["authors"][item["offset"]]["idx"])
            for a in item["authors"].values():
                pub_author_map.append((item["idx"], a["idx"]))
        for l in labels:
            for i in range(len(labels[l])):
                for j in range(i+1, len(labels[l])):
                    authors_map.append((labels[l][i], labels[l][j]))
    with open("pub_author_map.pkl", "wb") as f_out:
        pickle.dump(pub_author_map, f_out)
    with open("authors_map.pkl", "wb") as f_out:
        pickle.dump(authors_map, f_out)

    attr = [None for i in range(25102)]
    for d in data:
        for pub in d["pubs"]:
            title, venue = pub["title"], pub["venue"]
            if title:
                title = [stemmer.stem(w) for w in word_tokenize(pub["title"].lower())]
            if venue:
                venue = [stemmer.stem(w) for w in word_tokenize(pub["venue"].lower())]
            attr[pub["idx"]] = ("pub", title, venue)
            for a in pub["authors"].values():
                name, aff = a["name"], a["aff"]
                if name:
                    name = [stemmer.stem(w) for w in word_tokenize(a["name"].lower())]
                if aff:
                    aff = [stemmer.stem(w) for w in word_tokenize(a["aff"].lower())]
                attr[a["idx"]] = ("author", name, aff)
    with open("attr.pkl", "wb") as f_out:
        pickle.dump(attr, f_out)
Code example #32
def LDA_Topic_Clustering(corp, reading_weight, new_model, class_num,
                         LDA_passes, x, y):

    # ------------------- 1 Stop words----------------------
    #raw = re.sub("\d+","",raw)
    #raw = raw.replace("’","'")
    English_stop_words = get_stop_words('en')
    My_list = [
        ".'", ".']", "]']", "\'\'", 'one', 'two', 'three', 'four', 'five',
        'six', 'seven', 'eight', 'nine', 'ten', '://', 'http', 'www', 'com',
        'don', 'pre', 'paid', 'must', 'tcan', 'twhen', 'twhat', 'via', 'are',
        'will', 'said', 'can', 'near', 'and', 'the', 'i', 'a', 'to', 'it',
        'was', 'he', 'of', 'in', 'you', 'that', 'but', 'so', 'on', 'up', 'we',
        'all', 'for', 'out', 'me', 'him', 'they', 'says', 'got', 'then',
        'there', 'no', 'his', 'as', 'with', 'them', 'she', 'said', 'down',
        'see', 'had', 'when', 'about', 'what', 'my', 'well', 'if', 'at',
        'come', 'would', 'by', 'one', 'do', 'be', 'her', "didn't", 'jim',
        'get', "don't", 'time', 'or', 'right', 'could', 'is', 'went', "warn't",
        "ain't", 'good', 'off', 'over', 'go', 'just', 'way', 'like', 'old',
        'around', 'know', 'de', 'now', 'this', 'along', 'en', 'done',
        'because', 'back', "it's", 'tom', "couldn't", 'ever', 'why', 'going',
        'little', 'some', 'your', 'man', 'never', 'too', 'more', 'say', 'says',
        'again', 'how', 'here', 'tell', 'message', 'posted', 'need', 'needs',
        'someone', 'government', 'intelligence', 'report'
    ]

    stoplist_1 = set(
        'a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9 0'
        .split(' '))  # Create a set of enlighs alphabets
    stoplist_2 = set(English_stop_words)
    stoplist_3 = set(
        'es la . , . <br> <br><br> br > : >< < .< { } [ ] ( ) ,\'\'  ." ` " ? ! - \u201d< \u201d .\u201d \u201d u201d \u2019 \xe9 !< >!'
        .split(' '))  # Create a set
    #stoplist_33 = set(' .' .'] '.split(' ')) # Create a set
    stoplist_4 = set(My_list)

    stoplist = stoplist_1 | stoplist_2 | stoplist_3 | stoplist_4
    # ------------------- 2 tokenizer ----------------------

    stopped_tokens = [
        [
            word
            for word in WordPunctTokenizer().tokenize(str(document).lower())
            if ((word not in stoplist) & (word != u'.\u201d<')
                & (word != u'.\u201d') & (word != u'\u201c') & (len(word) > 2)
                & (is_int(word) == False))
        ]  #  & (is_int(word) == False)  & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+',	 word) == False) ) ]
        for document in corp
    ]

    # ------------------- 3 Stemming and Count word frequencies -------------------
    p_stemmer = PorterStemmer()
    stemmer = {}
    texts = []
    texts_set = []  #set()
    de_stemmer = {}

    for stopped_token in stopped_tokens:
        stemmed_texts = [p_stemmer.stem(i) for i in stopped_token]
        texts_set += [stemmed_texts]
    #texts_set = stopped_tokens    # Without stemmer

    for j in range(0, len(texts_set)):
        for i in range(0, len(texts_set[j])):
            if not texts_set[j][i] in de_stemmer:
                de_stemmer[texts_set[j][i]] = stopped_tokens[j][
                    i]  # Save it later for de_stemmer!

    frequency = defaultdict(int)
    for text in texts_set:
        for token in text:
            frequency[token] += 1

    # Keep tokens with nonzero frequency (change to > 1 to drop words that appear only once)
    processed_corpus = [[token for token in text if frequency[token] > 0]
                        for text in texts_set]
    #print processed_corpus
    #return 0
    # ------------------- 4 Dictionary and TF-IDF Vectors -------------------
    my_dictionary = corpora.Dictionary(processed_corpus)
    ids2words = my_dictionary.token2id
    bow_corpus = [my_dictionary.doc2bow(text) for text in processed_corpus]

    # ------------------- Add user interactions weights ----------------------
    i = 0
    new_corp = []
    for each_doc in bow_corpus:
        new = []
        for each_word in each_doc:
            new.append(
                (each_word[0], each_word[1] * (1 + reading_weight[i]))
            )  # new.append(each_word[0], float(each_word[1]) * (1+reading_weight[i]))
        new_corp.append(new)
        i += 1
    # train the model
    Text_tfidf = models.TfidfModel(new_corp)

    all_vectors = Text_tfidf[new_corp]   # Gives representative TF-IDF vectors

    # print "\n bow_corpus: ", bow_corpus
    # print "\n new weighted Docs: ", new_corp
    # print "\n TF-IDF vectors: ", all_vectors
    # print "\n TF-IDF size: ", len(all_vectors)
    # print "\n TF-IDF zero: ", all_vectors[0]
    # for doc in all_vectors:
    # print "each: ", doc

    # print "\n The End "

    # ------------------- 5 LDA Model and -------------------

    if os.path.isfile("./LDAmodels/LDAmodel_dataset" + str(x) + "_P" + str(y) +
                      "_class" + str(class_num) + ".lda") == 0 or (
                          new_model == 1):  # Do you want to train the model?
        print "\n LDA Model Training..."
        Text_lda = models.LdaModel(new_corp,
                                   id2word=my_dictionary,
                                   num_topics=class_num,
                                   passes=LDA_passes)  #  with out TF-IDF model
        Text_lda.save("./LDAmodels/LDAmodel_dataset" + str(x) + "_P" + str(y) +
                      "_class" + str(class_num) +
                      ".lda")  # same for tfidf, lsa, ...
    else:
        print "\n LDA Model Loading..."
        Text_lda = models.LdaModel.load("./LDAmodels/LDAmodel_dataset" +
                                        str(x) + "_P" + str(y) + "_class" +
                                        str(class_num) + ".lda")

    # ------------------- 6 Document Vectors and Classification -------------------

    counter = []
    doc_topics = []

    for each in range(0, class_num):
        counter.append(0)

    for index, document in enumerate(
            all_vectors):  # Each documents probability to calss
        # infer topic distribution for each document
        doc_topics.append(Text_lda.get_document_topics(
            document))  # , minimum_probability=0.19)
        #        No_Topic = 1
        new_list = []
        for each_topic in doc_topics[-1]:
            new_list.append(each_topic[1])

        t_index, value = max(enumerate(new_list), key=operator.itemgetter(1))

        # print "\n index: ", t_index
        # print "\n doc_topics: ", doc_topics
        counter[t_index] += 1
    # ------------------- 7 Create a bag for topic keywords -------------------
    topicWordTags = []
    topicWordTags2 = []
    topicWordTags3 = []

    finalBag = []
    for each in range(0, class_num + 1):
        topicWordTags.append(set())
        topicWordTags2.append([])
        topicWordTags3.append([])  #set())
        finalBag.append('')
    # print "\n Topic word empty: ", topicWordTags

    # ------------------- 8 Topic summary output -------------------
    output_topics = Text_lda.show_topics(
        num_topics=class_num, num_words=15,
        formatted=False)  # To review topics and terms individually

    return finalBag, topicWordTags, topicWordTags2, topicWordTags3, de_stemmer, ids2words, all_vectors, Text_lda, my_dictionary, Text_tfidf, output_topics, de_stemmer, doc_topics
Code example #33
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, dataset.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [
        ps.stem(word) for word in review if not word in set(all_stopwords)
    ]
    review = ' '.join(review)
    corpus.append(review)

#Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

#Splitting the dataset into training and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,
Code example #34
 def Stem_words(self,words):
     stemmer = PorterStemmer()
     Stemmed_words = [stemmer.stem(w) for w in words]
     return " ".join(Stemmed_words)
Code example #35
sentences= sent_tokenize(data)

from nltk import word_tokenize
token=word_tokenize(data)

words=[word for word in token if word.isalpha()]

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)
from nltk.stem.porter import PorterStemmer


porter = PorterStemmer()
words=[word for word in words if not word in stop_words]
stemmed = [porter.stem(word) for word in words]

from wordcloud import WordCloud
import matplotlib.pyplot as plt

wordcloud = WordCloud(width = 1000, height = 500).generate(" ".join(stemmed))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
str1 = ''.join(stemmed)
type(str1)


#bigram and trigram
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
Code example #36
        line__ = [data_.get(h) for h in header]
        data.append(line__)
        df = pd.DataFrame(data, columns=header)
    return df

df = open_file('Cell_Phones_and_Accessories_5.json')
text_list = df['reviewText'].values.tolist()

ttagged_list = []
ptagged_list = []
ltagged_list = []
stagged_list = []

for text in text_list:
    tokens = nltk.word_tokenize(text)
    portstemmed = [stemmer_porter.stem(token) for token in tokens]
    lancasterstemmed = [stemmer_lancaster.stem(token) for token in tokens]
    SBstemmed = [stemmer_snowball.stem(token) for token in tokens]
    ptagged = nltk.pos_tag(portstemmed)
    ltagged = nltk.pos_tag(lancasterstemmed)
    stagged = nltk.pos_tag(SBstemmed)
    ptagged_list.append(ptagged)
    ltagged_list.append(ltagged)
    stagged_list.append(stagged)

ptagged_list = np.expand_dims(np.asarray(ptagged_list),0)
ltagged_list = np.expand_dims(np.asarray(ltagged_list),0)
stagged_list = np.expand_dims(np.asarray(stagged_list),0)
print (ptagged_list.shape)
print (ltagged_list.shape)
print (stagged_list.shape)
Code example #37
File: app.py Project: amitabh27/Ideathon
def lda(user_last_read_article):
    
    #word_tokenizing
    global sent_to_words

    def sent_to_words(sentences):
        for sentence in sentences:
            yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
    data_words = list(sent_to_words(data))


    p_stemmer = PorterStemmer()
    en_stop = get_stop_words('en')

    data_lemmatized = []

    for i in data_words:
        tokens = i
        stopped_tokens = [i for i in tokens if not i in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
        data_lemmatized.append(' '.join(stemmed_tokens))
    
    global vectorizer,data_vectorized,lda_model,lda_output,best_lda_model
    
    if training == 1:
        vectorizer = CountVectorizer(analyzer='word',       
                                 #min_df=10,                        # minimum reqd occurences of a word 
                                 stop_words='english',             # remove stop words
                                 lowercase=True,                   # convert all words to lowercase
                                 token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                                 # max_features=30000,             # max number of uniq words
                                )

        data_vectorized = vectorizer.fit_transform(data_lemmatized)

        #Building LDA model
        lda_model = LatentDirichletAllocation(n_components=8,               # Number of topics
                                          max_iter=20,               # Max learning iterations
                                          learning_method='online',   
                                          random_state=100,          # Random state
                                          batch_size=2,            # n docs in each learning iter
                                          evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                          n_jobs = -1,               # Use all available CPUs
                                         )
        lda_output = lda_model.fit_transform(data_vectorized)


        search_params = {'n_components': [3,5,7,9], 'learning_decay': [.5, .7, .9]}

        # Init the Model
        lda = LatentDirichletAllocation()

        # Init Grid Search Class
        model = GridSearchCV(lda, param_grid=search_params)

        # Do the Grid Search
        model.fit(data_vectorized)

        # Printing params for best model among all the generated ones
        # Best Model
        best_lda_model = model.best_estimator_
        
        outfile = open('vectorizer.pickled','wb')
        pickle.dump(vectorizer,outfile)
        outfile.close()
        outfile = open('data_vectorized.pickled','wb')
        pickle.dump(data_vectorized,outfile)
        outfile.close()
        outfile = open('lda_output.pickled','wb')
        pickle.dump(lda_output,outfile)
        outfile.close()
        outfile = open('lda_model.pickled','wb')
        pickle.dump(lda_model,outfile)
        outfile.close()
        outfile = open('best_lda_model.pickled','wb')
        pickle.dump(best_lda_model,outfile)
        outfile.close()
        
    else :
        
        infile = open('vectorizer.pickled','rb')
        vectorizer = pickle.load(infile)
        infile.close()
        infile = open('data_vectorized.pickled','rb')
        data_vectorized = pickle.load(infile)
        infile.close()
        infile = open('lda_output.pickled','rb')
        lda_output = pickle.load(infile)
        infile.close()
        infile = open('lda_model.pickled','rb')
        lda_model = pickle.load(infile)
        infile.close()
        infile = open('best_lda_model.pickled','rb')
        best_lda_model = pickle.load(infile)
        infile.close()


    #dominant topic in each doc

    # Create Document - Topic Matrix
    lda_output = best_lda_model.transform(data_vectorized)

    # column names
    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

    # index names
    docnames = ["Doc" + str(i) for i in range(len(data))]

    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic

    df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")

    # defining topic keywords 
    # Topic-Keyword Matrix
    df_topic_keywords = pd.DataFrame(best_lda_model.components_)

    # Assign Column and Index
    df_topic_keywords.columns = vectorizer.get_feature_names()
    df_topic_keywords.index = topicnames

    # View
    df_topic_keywords.head()

    #get top 15 keywords for each doc


    # Show top n keywords for each topic
    def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
        keywords = np.array(vectorizer.get_feature_names())
        topic_keywords = []
        for topic_weights in lda_model.components_:
            top_keyword_locs = (-topic_weights).argsort()[:n_words]
            topic_keywords.append(keywords.take(top_keyword_locs))
        return topic_keywords

    topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)  

    #Given a piece of text, predicting the topic in document

    def predict_topic(text):
        global sent_to_words

        mytext_2 = list(sent_to_words(text))
        #print(mytext_2)

        mytext_3 =[]

        for i in mytext_2 :

            tokens=i
            stopped_tokens = [i for i in tokens if not i in en_stop]
            #print(stopped_tokens)
            stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
            #print(stemmed_tokens)
            mytext_3.append(' '.join(stemmed_tokens))
            #print(mytext_3)

            mytext_4 = vectorizer.transform(mytext_3)

        topic_probability_scores = best_lda_model.transform(mytext_4)
        topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()
        return topic, topic_probability_scores



    #Given a piece of Text, predicting the documents that are related to it most closely

    from sklearn.metrics.pairwise import euclidean_distances

    def similar_documents(text, doc_topic_probs, documents = data, top_n=2, verbose=False):
        topic, x  = predict_topic(text)
        dists = euclidean_distances(x.reshape(1, -1), doc_topic_probs)[0]
        doc_ids = np.argsort(dists)[:top_n]
        return doc_ids, np.take(documents, doc_ids)

    arr=[]
    arr.append(user_last_read_article)
    doc_ids, docs = similar_documents(text=arr, doc_topic_probs=lda_output, documents = data, top_n=2, verbose=True)
    result_api.append(doc_ids[0])
    result_api.append(doc_ids[1])
    print(result_api)
Code example #38
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [
        ps.stem(word) for word in review
        if not word in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)

# Creating a Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
Code example #39
class BidafQaPredictor(Predictor):
    """
    Converts the QA JSON into an instance that is expected by BiDAF model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._stemmer = PorterStemmer()
        self._stop_words = set(stopwords.words('english'))

    @overrides
    def _json_to_instance(
            self,  # type: ignore
            json_dict: JsonDict) -> Instance:
        # pylint: disable=arguments-differ
        """
        Expects JSON that looks like ``{"question": { "stem": "..."}, "para": "..."}``.
        """
        question_text = json_dict["question"]["stem"]
        passage_text = json_dict["para"]
        return self._dataset_reader.text_to_instance(question_text,
                                                     passage_text)

    @overrides
    # def predict_json(self, inputs: JsonDict, cuda_device: int = -1):
    def predict_json(self, inputs: JsonDict):
        instance = self._json_to_instance(inputs)
        # outputs = self._model.forward_on_instance(instance, cuda_device)
        outputs = self._model.forward_on_instance(instance)
        json_output = inputs
        span_str = outputs["best_span_str"]
        # If the file has an answer key, calculate the score
        if "answerKey" in json_output:
            answer_choices = json_output["question"]["choices"]
            # Score each answer choice based on its overlap with the predicted span.
            for choice in answer_choices:
                choice_text = choice["text"]
                choice_score = self._overlap_score(choice_text, span_str)
                choice["score"] = choice_score

            # Get the maximum answer choice score
            max_choice_score = max(answer_choices,
                                   key=itemgetter("score"))["score"]
            # Collect all answer choices with the same score
            selected_answers = [
                choice["label"] for choice in answer_choices
                if choice["score"] == max_choice_score
            ]
            answer_key = json_output["answerKey"]
            if answer_key in selected_answers:
                question_score = 1 / len(selected_answers)
            else:
                question_score = 0
            json_output["selected_answers"] = ",".join(selected_answers)
            json_output["question_score"] = question_score
        json_output["best_span_str"] = span_str
        return sanitize(json_output)

    def _overlap_score(self, answer: str, predicted_span: str) -> float:
        """
        Scores the predicted span against the correct answer by calculating the proportion of the
        stopword-filtered stemmed words in the correct answer covered by the predicted span
        :param answer: correct answer
        :param predicted_span: predicted span
        :return:
        """
        answer_tokens = self._get_tokens(answer)
        # degenerate case: if the answer only has stopwords, we can not score it.
        if not len(answer_tokens):
            return 0.0
        span_tokens = self._get_tokens(predicted_span)
        overlap = [tok for tok in answer_tokens if tok in span_tokens]
        score = len(overlap) / len(answer_tokens)
        return score

    def _get_tokens(self, phrase: str) -> List[str]:
        # Get the stopword-filtered lowercase stemmed tokens from input phrase
        return [
            self._stemmer.stem(word) for word in word_tokenize(phrase)
            if word.lower() not in self._stop_words
        ]
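
The stem-and-overlap scoring in _overlap_score can be reproduced outside the predictor. A minimal standalone sketch with invented strings:

# Standalone sketch of the stopword-filtered, stemmed overlap score used above.
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def get_tokens(phrase):
    return [stemmer.stem(w) for w in word_tokenize(phrase) if w.lower() not in stop_words]

answer_tokens = get_tokens("the boiling point of water")   # ['boil', 'point', 'water']
span_tokens = get_tokens("water boils at 100 degrees Celsius")
overlap = [tok for tok in answer_tokens if tok in span_tokens]
print(len(overlap) / len(answer_tokens))                    # 2/3: 'boil' and 'water' are covered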
Code example #40
File: Classifier.py Project: aakashvarma/sms.ai
dataset = pd.read_csv('spam.csv',delimiter = ',',encoding = "ISO-8859-1",engine='python')
dataset = dataset.drop(dataset.columns[[2, 3,4]], axis=1)
dataset['v1'] = dataset['v1'].map({'ham': 0, 'spam': 1}).astype(int)
# Cleaning the texts
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 5572):
    review = re.sub('[^a-zA-Z]', ' ', dataset['v2'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 0].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
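A hedged sketch of the usual continuation (standard scikit-learn API; not necessarily the original author's code):

classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))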
コード例 #41
0
# Tokenize before stemming and lemmatization
import nltk
tokens = nltk.word_tokenize(text)

# Stemming: extract word stems
# Import the Porter and Lancaster stemmer modules
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
# Instantiate a PorterStemmer object
porter_stemmer = PorterStemmer()
# Instantiate a LancasterStemmer object
lancaster_stemmer = LancasterStemmer()
# Create stemmed_list and lancaster_list to hold the PorterStemmer and LancasterStemmer results
stemmed_list = []
lancaster_list = []
for token in tokens:
    stemmed_list.append(porter_stemmer.stem(token))
    lancaster_list.append(lancaster_stemmer.stem(token))
print("Stemming results:")
print("1.PorterStemmer:", stemmed_list)
print("2.LancasterStemmer:", lancaster_list)

# Lemmatization: reduce words to their dictionary form
# NLTK's lemmatization is based on WordNet, so import WordNetLemmatizer.
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
# Create lem_list to hold the lemmatized tokens
lem_list = []
for token in tokens:
    lem_list.append(wordnet_lemmatizer.lemmatize(token))
print("Lemmatization results:")
print(lem_list)
コード例 #42
0
	try:
		temp = ""

		for i in doc_set:
		    
		    # clean and tokenize document string
		    if data[channels][i]['title'] != "No Title":
		    	
			    raw = data[channels][i]['title'].lower()
			    tokens = tokenizer.tokenize(raw)

			    # remove stop words from tokens
			    stopped_tokens = [token for token in tokens if token not in en_stop]
			    
			    # stem tokens
			    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
			    
			    # add tokens to list
			    texts.append(stemmed_tokens)

		# turn our tokenized documents into a id <-> term dictionary
		dictionary = corpora.Dictionary(texts)
		    
		# convert tokenized documents into a document-term matrix
		corpus = [dictionary.doc2bow(text) for text in texts]

		# generate LDA model
		ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)

		ldaAns = ldamodel.print_topics(num_topics=3, num_words=3)
        appendFile.write("\n")
        appendFile.close()

no_punctuation = []
for word in no_stopwords:
    if word.isalpha():
        no_punctuation.append(word)
        appendFile = open('/pfs/out/no_punctuation.txt', 'a', encoding='utf-8')
        appendFile.write(word)
        appendFile.write("\n")
        appendFile.close()

port_stem = PorterStemmer()
stemmed = []
for word in no_punctuation:
    stemmed_word = port_stem.stem(word)
    stemmed.append(stemmed_word)
    appendFile = open('/pfs/out/stemmed.txt', 'a', encoding='utf-8')
    appendFile.write(stemmed_word)
    appendFile.write("\n")
    appendFile.close()

lemmatizer = WordNetLemmatizer()
lemmatized = []
for word in no_punctuation:
    l_text = lemmatizer.lemmatize(word)
    lemmatized.append(l_text)
    appendFile = open('/pfs/out/lemmatized.txt', 'a', encoding='utf-8')
    appendFile.write(l_text)
    appendFile.write("\n")
    appendFile.close()
コード例 #44
0
    # Converting the entire review into lower case
    review = review.lower()

    # Tokenizing the review by words
    review_words = review.split()

    # Removing the stop words
    review_words = [
        word for word in review_words
        if not word in set(stopwords.words('english'))
    ]

    # Stemming the words
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review_words]

    # Joining the stemmed words
    review = ' '.join(review)

    # Creating a corpus
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, 1].values

# Creating a pickle file for the CountVectorizer
pickle.dump(cv, open('cv-transform.pkl', 'wb'))
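A hedged sketch of how the pickled vectorizer would typically be reloaded at prediction time (the filename matches the dump above; the sample review is a placeholder):

import pickle
loaded_cv = pickle.load(open('cv-transform.pkl', 'rb'))
sample = loaded_cv.transform(['the food was absolutely wonderful']).toarray()
print(sample.shape)  # (1, 1500), matching max_features=1500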
コード例 #45
0
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t")

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

dataset['Review'][0]
clean_review = []

for i in range(1000):
    Review = dataset['Review'][i]
    Review = re.sub('[^a-zA-Z]', ' ', Review)
    Review = Review.lower()
    Review = Review.split()
    Review = [
        ps.stem(token) for token in Review
        if not token in stopwords.words('english')
    ]
    Review = ' '.join(Review)
    clean_review.append(Review)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=4000)
X = cv.fit_transform(clean_review)
X = X.toarray()
y = dataset['Liked'].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

print(cv.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
コード例 #46
0
import re

import pandas as pd
from nltk.stem.porter import PorterStemmer


class TextConfidenceGenerator(object):
    def __init__(self, input_file_path, output_file_path):

        self.input_file_path = input_file_path
        self.output_file_path = output_file_path
        self.l2_1_list = []
        self.l4_1_list = []
        self.l5_1_list = []
        self.l6_1_list = []
        self.l2_2_list = []
        self.l4_2_list = []
        self.l5_2_list = []
        self.l6_2_list = []
        self.l7_list = []
        self.title_match_list = []  # added, checking s_itemname in r_title

        # Added For UI
        self.mpn_check_list = []
        self.upc_check_list = []
        self.asin_check_list = []
        self.gtin_check_list = []

        self.text_confidence_score = []

        self.stemmer = PorterStemmer()

    def check_field_contains(self, superset, subset):
        try:
            if type(superset) == float or type(subset) == float:
                return 0
            elif superset == '' or subset == '':
                return 0
            elif (str(superset).lower().find(str(subset).lower()) == -1) & (
                    str(subset).lower().find(str(superset).lower()) == -1):
                return 0
            else:
                return 1
        except:
            return 0

    def check_field_variants(self, superset, variant_info):
        try:
            if (type(variant_info) == float) or variant_info == '':
                return 0
            subsetlist = re.split(r"\s*\^\s*", variant_info)
            agg_score = 0
            for x in subsetlist:
                agg_score = agg_score + self.check_field_contains(
                    superset, str(x))
            return agg_score
        except:
            return 0

    def check_category_in_field(self, superset, category):
        if (type(superset) == float or superset
                == '') or (type(category) == float or category == ''):
            return 0

        categories = category.split('>')
        superset = superset.lower()

        if (type(superset) == float) or (type(categories) == float):
            return 0
        conf = 0
        for cat in categories:
            stemmed_cat = self.stemmer.stem(cat)

            if (stemmed_cat in superset) or (superset in cat.lower()):
                conf = 1
        return conf

    def check_field_in_another_field(self, superset, field):
        if (type(superset) == float or superset == '') or (type(field) == float
                                                           or field == ''):
            return 0
        field = field.lower()
        stemmed_field = self.stemmer.stem(field)
        superset = superset.lower()
        if stemmed_field in superset:
            return 1
        else:
            field_subset = field.split()
            field_subset_match_count = 0
            for part in field_subset:
                if (part in superset) or (self.stemmer.stem(part) in superset):
                    field_subset_match_count += 1
            field_in_another_field_weight = field_subset_match_count / len(
                field_subset)
            field_in_another_field_weight = "%.4f" % field_in_another_field_weight
            return (float(field_in_another_field_weight))

    def check_unique_identifier_match(self, search_id, result_id):
        search_id = str(search_id)
        result_id = str(result_id)

        if (type(search_id) == float or search_id
                == '') or (type(result_id) == float or result_id == ''):
            return 0
        elif search_id in result_id or result_id in search_id:  # handles cases of multiple asin/upc/mpn/gtin
            return 1
        else:
            return 0

    def process(self, row):
        product_sku = row['s_sku']

        l2_1 = self.check_category_in_field(row['r_item_name'],
                                            row['s_category'])
        self.l2_1_list.append(l2_1)
        l4_1 = self.check_field_variants(row['r_item_name'],
                                         row['s_variant_info'])
        self.l4_1_list.append(l4_1)
        l5_1 = self.check_field_in_another_field(row['r_item_name'],
                                                 row['s_manufacturer'])
        self.l5_1_list.append(l5_1)
        l6_1 = self.check_field_contains(row['r_item_name'], row['s_mpn'])
        self.l6_1_list.append(l6_1)

        l2_2 = self.check_category_in_field(row['r_description'],
                                            row['s_category'])
        self.l2_2_list.append(l2_2)
        l4_2 = self.check_field_variants(row['r_description'],
                                         row['s_variant_info'])
        self.l4_2_list.append(l4_2)
        l5_2 = self.check_field_in_another_field(row['r_description'],
                                                 row['s_manufacturer'])
        self.l5_2_list.append(l5_2)
        l6_2 = self.check_field_contains(row['r_description'], row['s_mpn'])
        self.l6_2_list.append(l6_2)
        l7 = self.check_field_contains(row['r_description'],
                                       row['s_item_name'])
        self.l7_list.append(l7)

        title_match = self.check_field_in_another_field(
            row['r_item_name'], row['s_item_name'])
        self.title_match_list.append(title_match)

        # additional checks
        mpn_check = self.check_unique_identifier_match(row['s_mpn'],
                                                       row['r_mpn'])
        upc_check = self.check_unique_identifier_match(row['s_upc'],
                                                       row['r_upc'])
        asin_check = self.check_unique_identifier_match(
            row['s_asin'], row['r_asin'])
        gtin_check = self.check_unique_identifier_match(
            row['s_gtin'], row['r_gtin'])

        self.mpn_check_list.append(mpn_check)
        self.upc_check_list.append(upc_check)
        self.asin_check_list.append(asin_check)
        self.gtin_check_list.append(gtin_check)

        total_row_confidence = l2_1 + l4_1 + l5_1 + l6_1 + l2_2 + l4_2 + l5_2 + l6_2 + l7\
              + title_match + mpn_check + upc_check + gtin_check + asin_check

        # total_row_confidence = l2_1 + l4_1 + l5_1 + l6_1 +  title_match
        self.text_confidence_score.append(total_row_confidence)

    def main(self):
        self.cpi_conf_df = pd.read_csv(self.input_file_path,
                                       sep='\t',
                                       encoding='ISO-8859-1',
                                       dtype=object)

        # self.cpi_conf_df.rename(columns={'s_image': 's_image_url', 's_link': 's_product_url','URL':'search_url', 's_title':'s_item_name'}, inplace=True)
        self.input_file_column_list = self.cpi_conf_df.columns.tolist()
        self.cpi_conf_df = self.cpi_conf_df.fillna(value='')

        search_columns = ['s_mpn','s_upc','s_asin','s_gtin','s_variant_info','s_manufacturer',\
              's_category','s_description']
        result_columns = ['r_mpn','r_upc','r_asin','r_gtin','r_variant_info','r_manufacturer',\
              'r_category','r_description']

        for col in search_columns + result_columns:
            # adding missing columns
            if col not in self.input_file_column_list:
                self.cpi_conf_df[col] = ''

        self.cpi_conf_df.apply(self.process, axis=1)

        column_list = ['sys_index','s_sku','s_product_url','s_item_name','s_category','s_description',\
              's_variant_info','s_manufacturer','s_mpn','s_upc','s_asin','s_gtin','SERP_URL','SERP_KEY',\
              'r_product_url','r_item_name','r_description','r_mpn','r_upc','r_asin','r_gtin',\
              'r_variant_info','r_manufacturer','r_category']

        self.cpi_conf_df = self.cpi_conf_df[column_list]
        self.cpi_conf_df[
            's_vs_r_text_confidence_matrix'] = self.text_confidence_score

        # self.cpi_conf_df['s_category_in_r_title'] = self.l2_1_list
        # self.cpi_conf_df['s_variant_info_in_r_title'] = self.l4_1_list
        # self.cpi_conf_df['s_manufacturer_in_r_title'] = self.l5_1_list
        # self.cpi_conf_df['s_mpn_in_r_title'] = self.l6_1_list

        self.cpi_conf_df['mpn_match'] = self.mpn_check_list
        self.cpi_conf_df['upc_match'] = self.upc_check_list
        self.cpi_conf_df['asin_match'] = self.asin_check_list
        self.cpi_conf_df['gtin_match'] = self.gtin_check_list

        self.cpi_conf_df['title_match'] = self.title_match_list

        self.cpi_conf_df.to_csv(self.output_file_path,
                                index=False,
                                sep='\t',
                                encoding='iso-8859-1')
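A hypothetical driver for the class above (the TSV paths are placeholders, not from the original project):

if __name__ == '__main__':
    generator = TextConfidenceGenerator('matched_products.tsv', 'matched_products_scored.tsv')
    generator.main()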
コード例 #47
0
from nltk.stem.porter import PorterStemmer


def word_steam(tokens):
    porter = PorterStemmer()
    return [porter.stem(word) for word in tokens]
コード例 #48
0
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Importing the dataset
data_set = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the data
corpus = []
data_set_review = data_set['Review']
for index in range(len(data_set_review)):
    review = data_set_review[index]
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    porter_stemmer = PorterStemmer()
    review = [
        porter_stemmer.stem(word) for word in review
        if not word in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)

# Creating the bag of words model
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(corpus).toarray()
y = data_set.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)
コード例 #49
0
# -*- coding: utf-8 -*-

from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

input_words = [
    'writing', 'calves', 'be', 'branded', 'horse', 'randomize', 'possibly',
    'provision', 'hospital', 'kept', 'scratchy', 'code'
]

porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

stemmer_names = ['INPUT WORD', 'PORTER', 'LANCASTER', 'SNOWBALL']
fmt = '{:>16}' * len(stemmer_names)
print(fmt.format(*stemmer_names))
print('=' * 68)

for word in input_words:
    output = [
        word,
        porter.stem(word),
        lancaster.stem(word),
        snowball.stem(word)
    ]
    print(fmt.format(*output))
コード例 #50
0
ファイル: main2.py プロジェクト: Gabe1704/antiProfanity
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

stop_words = set(stopwords.words('english'))

################################
abuseIndex = 0.00
tokensInput = input("Enter your phrase : ")

tokens = word_tokenize(tokensInput)
tokens = [w for w in tokens if not w in stop_words]

porter = PorterStemmer()
stems = []

for t in tokens:
    stems.append(porter.stem(t))

##########

df = pd.read_csv('experiment.csv')
df = df.drop_duplicates(subset='word')
print(df.head(15))

##########
print(tokens)
print(stems)
###########

for part in tokens:
    df_a = df[df["word"] == part]
    df_aIndex = df_a.index
def LDA_Topic(Int_type, de_stemmer, corp, Text_lda1, my_dictionary,
              Text_tfidf):
    # Determines the LDA topic number for search terms, notes, highlights, etc.
    # ------------------- 1 Stop words----------------------

    # <span class="highlight-pink">Cato</span>']

    # print "Input: ", corp
    #raw = re.sub("\d+","",raw)
    #raw = raw.replace("’","'")
    English_stop_words = get_stop_words('en')
    My_list = [
        'span', 'highlight', 'pink', 'class', 'one', 'two', 'three', 'four',
        'five', 'six', 'seven', 'eight', 'nine', 'ten', '://', 'http', 'www',
        'com', 'don', 'pre', 'paid', 'must', 'tcan', 'twhen', 'twhat', 'via',
        'are', 'will', 'said', 'can', 'near', 'and', 'the', 'i', 'a', 'to',
        'it', 'was', 'he', 'of', 'in', 'you', 'that', 'but', 'so', 'on', 'up',
        'we', 'all', 'for', 'out', 'me', 'him', 'they', 'says', 'got', 'then',
        'there', 'no', 'his', 'as', 'with', 'them', 'she', 'said', 'down',
        'see', 'had', 'when', 'about', 'what', 'my', 'well', 'if', 'at',
        'come', 'would', 'by', 'one', 'do', 'be', 'her', "didn't", 'jim',
        'get', "don't", 'time', 'or', 'right', 'could', 'is', 'went', "warn't",
        "ain't", 'good', 'off', 'over', 'go', 'just', 'way', 'like', 'old',
        'around', 'know', 'de', 'now', 'this', 'along', 'en', 'done',
        'because', 'back', "it's", 'tom', "couldn't", 'ever', 'why', 'going',
        'little', 'some', 'your', 'man', 'never', 'too', 'more', 'say', 'says',
        'again', 'how', 'here', 'tell', 'posted', 'need', 'needs', 'someone',
        'government', 'intelligence', 'report'
    ]

    stoplist_1 = set(
        'a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9 0'
        .split(' '))  # Create a set of English letters and digits
    stoplist_2 = set()
    #English_stop_words)
    stoplist_3 = set(
        'es la . , . <br> <br><br> br > : >< < .< { } [ ] ( ) .'
        '\' ` " “ ” ? ! - \u201d< \u201d .\u201d \u201d u201d \u2019 \xe9 !< >!'
        .split(' '))  # Create a set
    stoplist_4 = set(My_list)

    stoplist = stoplist_1 | stoplist_2 | stoplist_3 | stoplist_4
    # ------------------- 2 tokenizer ----------------------
    stopped_tokens = [
        [
            word
            for word in WordPunctTokenizer().tokenize(str(document).lower())
            if ((word not in stoplist) & (word != u'.\u201d<')
                & (word != u'.\u201d') & (len(word) > 2)
                & (is_int(word) == False))
        ]  #  & (is_int(word) == False)  & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+',	 word) == False) ) ]
        for document in corp
    ]
    # stopped_tokens = [[word for word in WordPunctTokenizer().tokenize(str(document).lower()) if ((word not in stoplist) & (word != u'.\u201d<') &(word != u'.\u201d') & (len(word) > 2)  & (is_int(word) == False) )]#  & (is_int(word) == False)  & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+',	 word) == False) ) ]
    # for document in corp]

    # ------------------- 3 Stemming and Count word frequencies -------------------
    p_stemmer = PorterStemmer()
    stemmer = {}
    texts = []
    texts_set = []

    for stopped_token in stopped_tokens:
        stemmed_texts = [p_stemmer.stem(i) for i in stopped_token]
        texts_set += [stemmed_texts]

    frequency = defaultdict(int)
    for text in texts_set:
        for token in text:
            frequency[token] += 1

    # Keep tokens with nonzero frequency (raise the threshold to drop rare words)
    processed_corpus = [[token for token in text if frequency[token] > 0]
                        for text in texts_set]

    # ------------------- 4 Dictionary and TF-IDF Vectors -------------------
    ids2words = my_dictionary.token2id
    bow_corpus = [my_dictionary.doc2bow(text) for text in processed_corpus]

    # print "------------------>>>>: ", corp
    all_vectors = Text_tfidf[
        bow_corpus]  #bow_corpus]   # Gives representative vectors

    # ------------------- 5 Document Vectors and Classification -------------------

    counter = []
    doc_topics = []

    for each in range(0, class_num):
        counter.append(0)

    for index, document in enumerate(
            all_vectors):  # Each document's probability for each class
        doc_topics.append(Text_lda1.get_document_topics(
            document))  # , minimum_probability=0.19)
        new_list = []
        for each_topic in doc_topics[-1]:
            new_list.append(each_topic[1])
        t_index, value = max(enumerate(new_list), key=operator.itemgetter(1))

    # ------------------- 6 Word tags -------------------
    new_list = []
    key_words = []
    i = 0
    # Words from doc TF-IDF Vector
    # Sort word bag of each document
    if len(all_vectors[0]) > 3:
        new_list = sorted(all_vectors[0],
                          key=lambda prob: prob[1],
                          reverse=True)
    else:
        new_list = all_vectors[0]
    # print "Topic: ", t_index + 1

    for i in range(0,
                   len(new_list)):  # Pick the first few keywords in the sorted list
        for key in ids2words:
            if ids2words[key] == new_list[i][0]:  # bow_corpus[1][2][0]:
                if (i < 3):  # first 3 keywords, no more
                    term = de_stemmer[key]
                    key_words.append(str(term))
                    topicWordTags[t_index + 1].add(
                        str(term))  # Add this to the bag of words

    # print "summary: ", key_words

    # ------------------- 6 Final Word tags and sorting -------------------
    # temp = [""]
    temp = corp[0]
    if Int_type == "Search":  #
        finalBag[t_index + 1] = finalBag[
            t_index + 1] + ' ' + temp[0] + ' ' + temp[0] + ' ' + temp[0]
        # print "Search: ", temp #finalBag[t_index + 1]
    elif Int_type == "Add note":
        finalBag[t_index +
                 1] = finalBag[t_index + 1] + ' ' + temp[0] + ' ' + temp[0]
    else:
        finalBag[t_index + 1] = finalBag[t_index + 1] + ' ' + temp[0]

    # finalBag[t_index + 1] = finalBag[t_index + 1] + ' ' + temp[0]   #bow_corpus   # Keep adding user interactions string entities...!
    # print "Bag: ", finalBag

    return t_index + 1
コード例 #52
0

spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]

#text preprocess
stop = stopwords.words('english')  
df_new['Answer_processed'] = df_new['Answer'].apply(lambda j: ' '.join([item for item in j.split() if item not in stop]))

for char in spec_chars:
    df_new['Answer_processed'] = df_new['Answer_processed'].str.replace(char, '', regex=False).str.lower()
    
stemmer = PorterStemmer()
df_new['Answer_processed'] = df_new['Answer_processed'].apply(lambda j: ' '.join([stemmer.stem(item) for item in j.split()]))

length = df_new[df_new['Answer_processed'].map(len) < 2].index
df_new.drop(length, inplace=True)
df_new = df_new.reset_index(drop=True)


#vader
FinalResults_Vader = pd.DataFrame()
analyzer = SentimentIntensityAnalyzer()

df_new['scores'] = df_new['Answer'].apply(lambda ans: analyzer.polarity_scores(ans))

df_new['compound'] = df_new['scores'].apply(lambda score_dict: score_dict['compound'])
df_new['positive'] = df_new['scores'].apply(lambda score_dict: score_dict['pos'])
df_new['negative'] = df_new['scores'].apply(lambda score_dict: score_dict['neg'])
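For reference, a small hedged sketch of what polarity_scores returns per answer (shown here with NLTK's bundled VADER; the original may import the analyzer from the standalone vaderSentiment package instead):

from nltk.sentiment.vader import SentimentIntensityAnalyzer  # requires nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
scores = analyzer.polarity_scores("The support team was quick and helpful.")
print(scores)  # a dict with 'neg', 'neu', 'pos' and 'compound' keys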
コード例 #53
0
#text preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

import nltk
nltk.download('stopwords')

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
corpus = []
for i in range(len(msg)):
    review = re.sub('[^a-zA-Z]', ' ', msg['text'][i])
    review = review.lower()
    review = review.split()
    review = [
        stemmer.stem(word) for word in review
        if not word in stopwords.words('english')
    ]
    review = ' '.join(review)
    corpus.append(review)

corpus

#creating bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, ngram_range=(1, 3))
X = cv.fit_transform(corpus).toarray()

X

X.shape
def entity_summary(my_dictionary1, docs_number, ids2words, doc_vectors,
                   output_topics, doc_topics, de_stemmer, class_num,
                   keyword_num, filename, filename2, filename3):

    English_stop_words = get_stop_words('en')
    My_list = [
        "u'\u201c'", 'span', 'highlight', 'pink', 'class', 'one', 'two',
        'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', '://',
        'http', 'www', 'com', 'don', 'pre', 'paid', 'must', 'tcan', 'twhen',
        'twhat', 'via', 'are', 'will', 'said', 'can', 'near', 'and', 'the',
        'i', 'a', 'to', 'it', 'was', 'he', 'of', 'in', 'you', 'that', 'but',
        'so', 'on', 'up', 'we', 'all', 'for', 'out', 'me', 'him', 'they',
        'says', 'got', 'then', 'there', 'no', 'his', 'as', 'with', 'them',
        'she', 'said', 'down', 'see', 'had', 'when', 'about', 'what', 'my',
        'well', 'if', 'at', 'come', 'would', 'by', 'one', 'do', 'be', 'her',
        "didn't", 'jim', 'get', "don't", 'time', 'or', 'right', 'could', 'is',
        'went', "warn't", "ain't", 'good', 'off', 'over', 'go', 'just', 'way',
        'like', 'old', 'around', 'know', 'de', 'now', 'this', 'along', 'en',
        'done', 'because', 'back', "it's", 'tom', "couldn't", 'ever', 'why',
        'going', 'little', 'some', 'your', 'man', 'never', 'too', 'more',
        'say', 'says', 'again', 'how', 'here', 'tell', 'posted', 'need',
        'needs', 'someone', 'government', 'intelligence', 'report'
    ]

    stoplist_1 = set(
        'a b c d e guy f size styled g h also number details since due countries using selling sent given earlier completely owed full player numerous thus recovered number i j k unknown move l m n o p q r else s t u v w x y z first becomes able actually absolutely necessary officialise entire stage issued'
        .split(' '))  # Create a set of single letters and miscellaneous filler words
    stoplist_2 = set(English_stop_words)
    stoplist_3 = set(
        'es la . , . taken <br> however require ratio note illumination homeland give order possibly think questions event hour case occurred yet confirmed destination million want update arrived removed responsibility known claiming icon role display none stating closed work apply research provided additional closed caused showed month succeeded knowledge stop coroner style index enclosed sudden seeks wait last soon centers outside believed feet happened begins colors hour people airing large claims area getting blkd highly whose young information made year ptf create make public date text tried space found name run ome ngoki agree everyone caller identification <br><br> br > : >< < .< { } [ ] ( ) .'
        '\' ` " “ ” ? ! - \u2018 \xe9 \u201c \u201d< \u201d .\u201d \u201d u201d \u201c looking .\u201d< \u2019 worth realized facilitated \xe9 keeping !< >! ago note sending'
        .split(' '))  # Create a set
    stoplist_4 = set(My_list)

    stoplist = stoplist_1 | stoplist_2 | stoplist_3 | stoplist_4

    p_stemmer = PorterStemmer()
    # ----------------- Process wordtags from user interactions -----------------------------------
    timeList = [
        'date', 'jan', 'january', 'feb', 'february', 'march', 'april', 'may',
        'present', 'jun', 'july', 'august', 'september', 'october', 'november',
        'december', '1998'
    ]
    placeList = [
        'engstrom', 'gastech', 'abila', 'kronos', 'petra', 'jet', 'limousine',
        'tethan', '', '', '', 'headquarters', 'tethys', 'elodis', 'airport',
        'airports', 'vastopolis', 'terrorist', 'brotherhood', 'antarctica',
        'washington', 'dhs', 'valujet', 'laboratory', 'dharan', 'bahrain',
        'qatar', 'kuwait', 'airlines', 'vastpress', 'ibm', 'suburbia', 'bruno',
        'lab', 'antarctica', 'nigeria', 'dubai', 'burj', 'syria', 'gaza',
        'sanaa', 'ebilaead', 'tabriz', 'venezuela', 'pakistan', 'countries',
        'saudi', 'arabia', 'kenya', 'iran', 'lebanon', 'russia', 'yemen',
        'turkey', 'arkadi', 'barcelona', 'paris', 'cafe', 'mosque',
        'exhibition', 'valley', 'moscow', 'downtown', 'mombasa', 'bangkok',
        'sudan', 'usa', 'washington', 'milan', 'italy', 'hospital', 'british',
        'soviet', 'antalya', 'malaysia', 'somalia', 'sana', 'lagos',
        'pyongyang', 'uae', 'kiev', 'hotel'
    ]
    peopleList = [
        'edvard', 'employee', 'ipo', 'president', 'firemen', 'apa', 'silvia',
        'protectors', '', 'wgo', 'torsten', 'juliana', 'dread', 'networks',
        'sanjorge', 'vann', 'employees', 'pok', 'sten', 'cato', 'ceo',
        'rebecca', 'karel', 'wfa', 'elian', 'carman', 'kapelou', 'nespola',
        'torsten', 'trucco', 'douglas', 'eggleston', 'lark', 'mayor', 'afghan',
        'philippines', 'paramurderers', 'bruno', 'psychobrotherhood',
        'pakistani', 'hasidic', 'brothers', 'hate', 'george', 'dombrovski',
        'columbia', 'mikhail', 'Kapolalum ', 'funsho', 'bukhari', 'ahmed',
        'basra', 'khouri', 'kasem', 'leonid', 'nahid', 'otieno', 'owiti',
        'leonid', 'baltasar', 'hombre', 'jhon', 'professor', 'saleh', 'tanya',
        'mohammed', 'borodinski', 'kashfi', 'khemkhaeng', 'boonmee',
        'ukrainian', 'german', 'italian', 'dutch', 'french', 'kapolalum',
        'funsho', 'mai', 'korongi', 'lashkar', 'hosain', 'haq', 'maulana',
        'bukhari', 'arab', 'ali', 'balochi', 'nicolai', 'aden', 'akram',
        'shamsheer', 'jeddah', 'kiev', 'abdullah', 'carabobo', 'bolivar',
        'bhutani', 'jumeirah', 'michieka', 'borodinski', 'otieno', 'wanjohi',
        'onyango', 'kenyan', 'nairobi', 'jtomski', 'hakan', 'vwhombre',
        'jorge', 'soltan', 'anka', 'green', 'joetomsk', 'igor', 'middleman'
    ]
    for j in range(1, len(finalBag)):  # Each topic
        # print "\n \n \n " , finalBag[j]

        # print "topic#: ", j

        # ------------------- 2 tokenizer ----------------------
        # print finalBag[j]
        # ------------------- 2 tokenizer ----------------------
        stopped_tokens = [
            [
                word for word in WordPunctTokenizer().tokenize(
                    str(document).lower())
                if ((word not in stoplist) & (word != u'.\u201d<')
                    & (word != u'\xe9') & (word != u'\u2018')
                    & (word != u'.\u201d') & (word != u'\u201c')
                    & (word != '\u201c') & (len(word) > 2)
                    & (is_int(word) == False))
            ]  #  & (is_int(word) == False)  & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+',	 word) == False) ) ]
            for document in [finalBag[j]]
        ]
        # stopped_tokens = [[word for word in WordPunctTokenizer().tokenize(str(document).lower()) if ((word not in stoplist) & (word != u'.\u201d<') &(word != u'.\u201d') & (len(word) > 2)  & (is_int(word) == False) )]#  & (is_int(word) == False)  & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+',	 word) == False) ) ]
        # for document in [finalBag[j]]]
        # print "Final stopped", stopped_tokens
        # ------------------- 3 Stemming and Count word frequencies -------------------
        # p_stemmer = PorterStemmer()
        stemmer = {}
        texts = []
        texts_set = []  #set()

        for stopped_token in stopped_tokens:
            stemmed_texts = [p_stemmer.stem(i) for i in stopped_token]
            texts_set += [stemmed_texts]

        frequency = defaultdict(int)
        for text in texts_set:
            for token in text:
                frequency[token] += 1

        # Keep tokens with nonzero frequency (raise the threshold to drop rare words)
        processed_corpus = [[token for token in text if frequency[token] > 0]
                            for text in texts_set]
        # print "\n Final proceesd", processed_corpus
        # ------------------- 4 Dictionary and TF-IDF Vectors -------------------
        ids2words = my_dictionary.token2id
        bow_corpus = [my_dictionary.doc2bow(text) for text in processed_corpus]

        # final_vectors = Text_tfidf[bow_corpus]    # With TF-IDF
        final_vectors = bow_corpus  # No TF-IDF

        # print "Input: ", bow_corpus
        # print "Input: ", final_vectors[0]
        new_list = []
        key_words = []
        # Words from doc TF-IDF Vector
        # Sort word bag of each document
        if len(final_vectors[0]) > 2:
            new_list = sorted(final_vectors[0],
                              key=lambda prob: prob[1],
                              reverse=True)
        else:
            new_list = final_vectors[0]

        # print "\n Bag of sorted: ", new_list
        # k = 0;
        # for i in range(0,len(new_list)):   # Pick the firts 10 keywords in sorted list
        # for key in ids2words:
        # if ids2words[key] == new_list[i][0]: # bow_corpus[1][2][0]:
        # if (k<10):              # first 3 keywords, no more
        # term = de_stemmer[key]
        # topicWordTags2[j].add(str(term))   # Add this to the bag of words
        # k = k+1
        accu = 0
        for each in new_list:
            accu += each[1]
            # print each, each[0], each[1]

        for i in range(
                0, len(new_list)):  # Walk the sorted keyword list
            for key in ids2words:
                if ids2words[key] == new_list[i][0]:  # bow_corpus[1][2][0]:
                    term = de_stemmer[key]
                    if (term in timeList):
                        group = 0
                    elif (term in placeList):
                        group = 1
                    elif (term in peopleList):
                        group = 2
                    else:
                        group = 3
                    score = float(new_list[i][1]) / accu
                    # print "score", float(new_list[i][1]), accu, score
                    # if score > 0.01:
                    topicWordTags2[j].append(
                        [str(term), group,
                         score])  # Add this to the bag of words

        print "\n Final Entites: ", len(new_list), topicWordTags2[j]

    # ----------------------- Process word tags from document vectors (to top up each topic's tag list to 20)
    for j in range(1, len(doc_topic_keywords)):  # Each topic

        # ------------------- 2 tokenizer ----------------------
        stopped_tokens = [
            [
                word for word in WordPunctTokenizer().tokenize(
                    str(document).lower())
                if ((word not in stoplist) & (word != u'.\u201d<')
                    & (word != u'\xe9') & (word != u'\u2018')
                    & (word != u'.\u201d') & (word != u'\u201c')
                    & (word != '\u201c') & (len(word) > 2)
                    & (is_int(word) == False))
            ]  #  & (is_int(word) == False)  & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+',	 word) == False) ) ]
            for document in [doc_topic_keywords[j]]
        ]

        # ------------------- 3 Stemming and Count word frequencies -------------------
        stemmer = {}
        texts = []
        texts_set = []  #set()

        for stopped_token in stopped_tokens:
            stemmed_texts = [p_stemmer.stem(i) for i in stopped_token]
            texts_set += [stemmed_texts]

        frequency = defaultdict(int)
        for text in texts_set:
            for token in text:
                frequency[token] += 1

        # Keep tokens with nonzero frequency (raise the threshold to drop rare words)
        processed_corpus = [[token for token in text if frequency[token] > 0]
                            for text in texts_set]
        # print "\n Final proceesd", processed_corpus
        # ------------------- 4 Dictionary and TF-IDF Vectors -------------------
        ids2words = my_dictionary.token2id
        bow_corpus = [my_dictionary.doc2bow(text) for text in processed_corpus]

        # final_vectors = Text_tfidf[bow_corpus]    # With TF-IDF
        final_vectors = bow_corpus  # No TF-IDF

        # print "Input: ", bow_corpus
        # print "Input: ", final_vectors[0]
        new_list = []
        key_words = []
        # Words from doc TF-IDF Vector
        # Sort word bag of each document
        if len(final_vectors[0]) > 2:
            new_list = sorted(final_vectors[0],
                              key=lambda prob: prob[1],
                              reverse=True)

        else:
            new_list = final_vectors[0]
        # print "\n Bag of sorted: ", new_list
        k = 0
        for i in range(
                0, len(new_list)):  # Pick the first 20 keywords in the sorted list
            for key in ids2words:
                if ids2words[key] == new_list[i][0]:  # bow_corpus[1][2][0]:
                    if (k < 20):  # first 20 keywords, no more
                        term = de_stemmer[key]
                        # topicWordTags3[j].add(str(term))   # Add this to the bag of words
                        topicWordTags3[j].append(
                            [str(term), 3,
                             0.1])  # Add this to the bag of words
                        k = k + 1
        # print "\n Final Entites: ", topicWordTags2[j]

    # ------------------------------ Add entities from user interactions ----------------------
    topic_hash = []

    for i in range(1, class_num + 1):  # topicWordTags[0] is always empty,
        tagWords = []
        temp_set = set()
        kk = 0
        for eachTag in topicWordTags2[i]:
            if kk < 20:
                if not (eachTag[0] in temp_set):
                    temp_set.add(eachTag[0])
                    tagWords.append(eachTag)
                    kk = kk + 1

        # ------------------------------ Add more entities from documents -----------------------
        #print topicWordTags3[i]
        if kk < 20:
            for eachTag in topicWordTags3[i]:
                if kk < 20:
                    if not (eachTag[0] in temp_set):
                        temp_set.add(eachTag[0])
                        #print "set > ", temp_set
                        tagWords.append(eachTag)
                        #print "List > ", tagWords
                        kk = kk + 1

        tagWords = sorted(tagWords, key=lambda k: k[2], reverse=True)
        temp = {"TopicNum: ": i - 1, "keywords": tagWords}
        topic_hash.append(temp)

    fout = open(filename3, "w")
    fout.write(json.dumps(topic_hash, indent=1))
    fout.close()

    # print "\n doc_topic_array: ", doc_topic_array
    # print "\n doc_topic_array: ", doc_key_word
    return
コード例 #55
0
review1

# # Stemming:
# - Convert word to its root word
#
# Eg:
# loved ----> love

# In[18]:

# Use Stemming to take word it to its Root form

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

review1 = [ps.stem(word) for word in review1]
review1

# In[19]:

# Convert list to string

review2 = ' '.join(review1)
review2

# ### Count-Vectorizer( )
# - This will construct the vocabulary of the bag-of-words model and transform the sentences into sparse feature vectors

# In[20]:

corpus1 = []
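The example is cut off here; a minimal hedged sketch of the Count-Vectorizer step described above (the two reviews are placeholders for the cleaned corpus):

from sklearn.feature_extraction.text import CountVectorizer

corpus1 = ['wow love place', 'crust not good']  # placeholder cleaned reviews
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus1).toarray()  # sparse counts densified into a feature matrix
print(X.shape)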
コード例 #56
0
for channels in data:
	# list for tokenized documents in loop
	texts = []
	doc_set = data[channels]
	# loop through document list
	for i in doc_set:
	    
	    # clean and tokenize document string
	    raw = i[0].lower()
	    tokens = tokenizer.tokenize(raw)

	    # remove stop words from tokens
	    stopped_tokens = [token for token in tokens if token not in en_stop]
	    
	    # stem tokens
	    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
	    
	    # add tokens to list
	    texts.append(stemmed_tokens)

	# turn our tokenized documents into a id <-> term dictionary
	dictionary = corpora.Dictionary(texts)
	    
	# convert tokenized documents into a document-term matrix
	corpus = [dictionary.doc2bow(text) for text in texts]

	# generate LDA model
	ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)

	ldaAns = ldamodel.print_topics(num_topics=3, num_words=3)
コード例 #57
0
def token2features(sent, i, add_neighs=True):
    """Compute the features of a token.

    All the features are boolean, i.e. they appear or they do not. For the token,
    you have to return a set of strings that represent the features that *fire*
    for the token. See the code below.

    The token is at position i, and the rest of the sentence is provided as well.
    Try to make this efficient, since it is called on every token.

    One thing to note is that it is only called once per token, i.e. we do not call
    this function in the inner loops of training. So if your training is slow, it's
    not because of how long it's taking to run this code. That said, if your number
    of features is quite large, that will cause slowdowns for sure.

    add_neighs is a parameter that allows us to use this function itself in order to
    recursively add the same features, as computed for the neighbors. Of course, we do
    not want to recurse on the neighbors again, and then it is set to False (see code).
    """
    porter = PorterStemmer()
    ftrs = []
    # bias
    ftrs.append("BIAS")
    # position features
    if i == 0:
        ftrs.append("SENT_BEGIN")
    if i == len(sent) - 1:
        ftrs.append("SENT_END")

    # the word itself
    word = unicode(sent[i])
    ftrs.append("WORD=" + word)
    ftrs.append("LCASE=" + word.lower())
    # Adding stemmed version of word.
    ftrs.append("STEMMED=" + porter.stem(word))
    # some features of the word
    if word.isalnum():
        ftrs.append("IS_ALNUM")
    if word.isnumeric():
        ftrs.append("IS_NUMERIC")
    if word.isdigit():
        ftrs.append("IS_DIGIT")
    if word.isupper():
        ftrs.append("IS_UPPER")
    if word.islower():
        ftrs.append("IS_LOWER")
    # Additional features
    if word.startswith("http") or word.endswith(".com"):
        ftrs.append("IS_URL")
    if word in abbreviations:
        ftrs.append("IS_ABRV")
    if word.startswith("#"):
        ftrs.append("IS_HASHTAG")
    if word.startswith("@"):
        ftrs.append("IS_MENTION")
    # previous/next word feats
    if add_neighs:
        if i > 0:
            for pf in token2features(sent, i - 1, add_neighs=False):
                ftrs.append("PREV_" + pf)
        if i < len(sent) - 1:
            for pf in token2features(sent, i + 1, add_neighs=False):
                ftrs.append("NEXT_" + pf)

    # return it!
    return ftrs
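A hedged usage sketch (Python 2, matching the unicode() call above; the abbreviations set is a placeholder for whatever the module defines elsewhere):

abbreviations = set(['lol', 'brb'])  # placeholder; the real set lives elsewhere in the module
sent = ['I', 'love', 'Paris']
print token2features(sent, 2)
# -> ['BIAS', 'SENT_END', 'WORD=Paris', 'LCASE=paris', 'STEMMED=pari', 'IS_ALNUM', ...]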
コード例 #58
0
# print(dataset[90:100])

# Cleaning text
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [
        ps.stem(x) for x in review if not x in set(stopwords.words('english'))
    ]
    review = ' '.join(review)
    corpus.append(review)
# print(corpus)

# Creating the Bag of Words Model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Using NaiveBayes on dependent and independent variables
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
コード例 #59
0
class Preprocess:

    def __init__(self, text):

        self.text = text
        self.STOPWORDS = set(stopwords.words('english'))
        self.spell = SpellChecker()
        self.p = inflect.engine()
        self.nlp = en_core_web_sm.load()
        #self.nlp = spacy.load('en_core_web_md')
        self.model = api.load("glove-twitter-25")
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    
    def strip_html_tags(self):

        """remove html tags from text"""
        soup = BeautifulSoup(self.text, "html.parser")
        stripped_text = soup.get_text(separator=" ")
        return stripped_text
    

    def remove_accented_chars(self):

        """remove accented characters from text, e.g. café"""
        text = unidecode.unidecode(self.text)
        return text

    
    '''def expand_contractions(self, text):
        """expand shortened words, e.g. don't to do not"""
        text = list(cont.expand_texts([text], precise=True))[0]
        return text'''
    

    def pos_tagging(self): 

        word_tokens = word_tokenize(self.text) 
        return pos_tag(word_tokens)

    
    def text_lowercase(self): 

        return self.text.lower()


    def text_uppercase(self): 

        return self.text.upper()

    
    def remove_numbers(self): 

        result = re.sub(r'\d+', '', self.text) 
        return result

    
    def convert_number(self): 

        # split string into list of words 
        temp_str = self.text.split() 
        # initialise empty list 
        new_string = [] 
    
        for word in temp_str: 
            # if word is a digit, convert the digit 
            # to numbers and append into the new_string list 
            if word.isdigit(): 
                temp = self.p.number_to_words(word)
                new_string.append(temp) 
    
            # append the word as it is 
            else: 
                new_string.append(word) 
    
        # join the words of new_string to form a string 
        temp_str = ' '.join(new_string) 
        return temp_str
    

    def remove_punctuation(self): 

        translator = str.maketrans('', '', string.punctuation) 
        return self.text.translate(translator)

    
    def remove_whitespace(self): 

        return  " ".join(self.text.split()) 

    
    def remove_stopwords(self):

        """custom function to remove the stopwords"""
        return " ".join([word for word in str(self.text).split() if word not in self.STOPWORDS])

    
    def stem_words(self):

        return " ".join([self.stemmer.stem(word) for word in self.text.split()])


    def lemmatize_words(self):

        return " ".join([self.lemmatizer.lemmatize(word) for word in self.text.split()])

    
    def remove_freqwords(self, df, column_name):

        """custom function to remove the 10 most frequent words in the given column"""

        cnt = Counter()

        for text in df[column_name].values:
            for word in text.split():
                cnt[word] += 1
        FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])

        return " ".join([word for word in str(self.text).split() if word not in FREQWORDS])

    
    def remove_emoji(self):

        emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', self.text)

    
    def remove_emoticons(self):

        emoticon_pattern = re.compile(u'(' + u'|'.join(re.escape(k) for k in EMOTICONS) + u')')
        return emoticon_pattern.sub(r'', self.text)

    
    def convert_emoticons(self):

        text = self.text
        for emot in EMOTICONS:
            # accumulate substitutions instead of overwriting from self.text each iteration
            text = re.sub(u'(' + re.escape(emot) + ')', "_".join(EMOTICONS[emot].replace(",", "").split()), text)
        return text


    def remove_urls(self):

        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', self.text)

    
    def remove_html(self):

        html_pattern = re.compile('<.*?>')
        return html_pattern.sub(r'', self.text)

    
    def correct_spellings(self):

        corrected_text = []
        misspelled_words = self.spell.unknown(self.text.split())

        for word in self.text.split():
            if word in misspelled_words:
                corrected_text.append(self.spell.correction(word))
            else:
                corrected_text.append(word)

        return " ".join(corrected_text)


    def NER(self):

        doc = self.nlp(self.text)
        entity_label_map = dict()

        for entity in doc.ents:
            entity_label_map[entity.text] = entity.label_
        
        return entity_label_map
コード例 #60
0
# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:

    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=2,
                                           id2word=dictionary,
                                           passes=20)
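A short usage sketch: inspecting the two learned topics (gensim's print_topics returns (topic_id, formatted_string) pairs):

for topic_id, topic in ldamodel.print_topics(num_topics=2, num_words=3):
    print(topic_id, topic)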