def SimpleSpamFilter(self, docs):
    import string
    corpus = []
    for text in docs:
        # Normalize the text: strip mentions, links, punctuation, digits and extra whitespace
        t = self.FilterMentions(text)
        t = self.FilterLinks(t)
        t = re.sub(re.compile(r'\s+'), ' ', t)  # t = ' '.join(t.split())
        t = ''.join([i for i in t if i not in set(string.punctuation)])
        t = ''.join([i for i in t if not i.isdigit()])
        t = t.strip().lower()
        if t not in self.scanned and t:
            if self.possible_spam:
                for spam in self.possible_spam:
                    # A substring match in either direction marks the text as possible spam
                    if re.search(re.escape(t), spam) or re.search(re.escape(spam), t):
                        # self.redundant.append(text)
                        self.possible_spam.append(t)
                        self.possible_spam = list(set(self.possible_spam))
                    else:
                        corpus.append(text)
            else:
                corpus.append(text)
            self.scanned.append(t)
        elif t:
            # Already scanned: treat a repeated normalized text as possible spam
            # self.redundant.append(text)
            self.possible_spam.append(t)
            self.possible_spam = list(set(self.possible_spam))
    return list(set(filter(None, corpus)))
def create_corpus_df(tweet, target):
    corpus = []
    for x in tweet[tweet['target'] == target]['text'].str.split():
        for i in x:
            corpus.append(i)
    return corpus
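# Hedged usage sketch for create_corpus_df (not part of the original code): it assumes a
# hypothetical DataFrame with 'text' and 'target' columns, e.g. the disaster-tweets layout.
import pandas as pd

tweet_df = pd.DataFrame({
    'text': ['Forest fire near La Ronge Sask Canada', 'I love fruits'],
    'target': [1, 0],
})
target_words = create_corpus_df(tweet_df, 1)   # flat list of words from class-1 tweets
print(target_words)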
import nltk
import feedparser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


def tokenize_and_crawl(article):
    """
    Function called if the news article is not found in the hive database; crawl the web for it.
    :param article:
    :return: None
    """
    # Tokenize and lemmatize the input article
    article_string = ""
    for i in article:
        article_string += i
        article_string += " "
    tokens = word_tokenize(article_string)
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Remove stop words
    stop_w = stopwords.words('english')
    w = []
    for z in tokens:
        if z not in stop_w:
            w.append(z)
    # Build a '+'-separated query string from the remaining keywords
    query_string = ""
    for o in w:
        query_string += o
        query_string += "+"
    # Crawl using the Google News RSS feed
    print("Keywords used to crawl NEWS sites for this article are :", query_string)
    crawl_url = "http://news.google.com/news?q=" + query_string + "&output=rss"
    feed = feedparser.parse(crawl_url)
    corpus = []
    print("------------------------------------------------------")
    print("CRAWLING RESULTS : ")
    for x in feed.entries:
        print(x['title'])
        corpus.append(x['title'])
    string_cosine_correlation_level3(article, corpus)
def hive_news_dump_connection():
    with pyhs2.connect(host='192.168.56.101',
                       port=10000,
                       authMechanism='PLAIN',
                       user='******',
                       password='******',
                       database='anuvrat') as conn:
        temp_query = 'SELECT name, message, description FROM ABC_NEWS LIMIT 10'
        with conn.cursor() as cur:
            cur.execute(temp_query)
            corpus = []
            for x in cur.fetchall():
                for i in x:
                    # Strip replacement characters and NUL bytes left over from the dump
                    i = i.replace('\xef\xbf\xbd\xef\xbf\xbd', '')
                    corpus.append(i.replace('\x00', ''))
            return corpus
#stock_onlycols['Top1'] = stock_onlycols['Top1'].apply(lambda x: [item for item in x if item not in stop])
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Join the Top1..Top25 headline columns of each row into a single string
headlines = []
for i in range(0, len(stock_onlycols.index)):
    headlines.append(' '.join(str(x) for x in stock_onlycols.iloc[i, 0:25]))

# Lemmatize and drop stop words; corpus keeps the token lists, headlines the rejoined strings
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
corpus = []
for i in range(0, len(headlines)):
    words = headlines[i].split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    headlines[i] = ' '.join(words)
    corpus.append(words)

from sklearn.feature_extraction.text import CountVectorizer
#cv = CountVectorizer(max_features=2500)
X = CountVectorizer(tokenizer=lambda doc: doc,
                    lowercase=False,
                    max_features=1199).fit_transform(corpus).toarray()
y = train['Label']
X = X.reshape(1199, 25)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
# assumed loop context: the fragment indexes sms['message'][i] inside a loop over the sms
# DataFrame; the PorterStemmer instance `ps` and the empty corpus list are assumed as well
ps = PorterStemmer()
corpus = []
for i in range(0, len(sms)):
    # replace every character except a-z and A-Z with a space
    review = re.sub('[^a-zA-Z]', ' ', sms['message'][i])
    # convert to lower case
    review = review.lower()
    # split into words
    review = review.split()
    # filter out the stopwords and stem the remaining words
    review = [
        ps.stem(word) for word in review
        if word not in set(stopwords.words('english'))
    ]
    # then join
    review = ' '.join(review)
    # and append to corpus
    corpus.append(review)

# convert labels to a dummy variable
y = pd.get_dummies(sms['labels'])
y = y.iloc[:, 1].values

# implement BoW from sklearn using the CountVectorizer class
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
def create_corpus_new(df):
    corpus = []
    for tweet in tqdm(df['text']):
        words = [word.lower() for word in word_tokenize(tweet)]
        corpus.append(words)
    return corpus
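# Hedged usage sketch for create_corpus_new (not part of the original code): it assumes the
# NLTK 'punkt' tokenizer data is available and a hypothetical DataFrame with a 'text' column.
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from tqdm import tqdm

nltk.download('punkt')
sample_df = pd.DataFrame({'text': ['Just happened a terrible car crash', 'What a nice day']})
tokenized_corpus = create_corpus_new(sample_df)   # list of lower-cased token lists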
y = pickle.load(f)

corpus = []
# pre-processing text
for i in range(0, len(x)):
    # remove non-word characters (punctuation such as , . : ; ` ! etc.)
    data = re.sub(r'\W', ' ', str(x[i]))
    # convert to lower case
    data = data.lower()
    # remove single characters in between the text
    data = re.sub(r'\s+[a-z]\s+', ' ', data)
    # remove single characters at the start of a sentence
    data = re.sub(r'^[a-z]\s+', ' ', data)
    # remove extra spaces
    data = re.sub(r'\s+', ' ', data)
    corpus.append(data)

# building the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer(max_features=2000,
                         min_df=3,
                         max_df=0.6,
                         stop_words=stopwords.words('english'))
x = vector.fit_transform(corpus).toarray()

# pickling the TF-IDF model
with open("TFIDF.pickle", 'wb') as f:
    pickle.dump(vector, f)

# unpickling the TF-IDF model
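# Hedged sketch of the "unpickling the TF-IDF model" step announced by the comment above;
# it assumes the same TFIDF.pickle file written earlier in this snippet.
import pickle

with open("TFIDF.pickle", 'rb') as f:
    vector = pickle.load(f)

# The reloaded vectorizer can then transform new documents into the same feature space
new_features = vector.transform(["some new document text"]).toarray()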