Example #1
    def SimpleSpamFilter(self, docs):
        import string
        corpus = []

        for text in docs:
            # strip mentions, links, extra whitespace, punctuation and digits
            t = self.FilterMentions(text)
            t = self.FilterLinks(t)
            t = re.sub(r'\s+', ' ', t)
            t = ''.join(i for i in t if i not in set(string.punctuation))
            t = ''.join(i for i in t if not i.isdigit())
            t = t.strip().lower()

            if t and t not in self.scanned:
                # flag the text as possible spam if it overlaps any known spam entry
                if any(re.search(re.escape(t), spam) or re.search(re.escape(spam), t)
                       for spam in self.possible_spam):
                    self.possible_spam.append(t)
                    self.possible_spam = list(set(self.possible_spam))
                else:
                    corpus.append(text)
                self.scanned.append(t)
            elif t:
                # a repeated, non-empty text is treated as possible spam
                self.possible_spam.append(t)
                self.possible_spam = list(set(self.possible_spam))
        return list(set(filter(None, corpus)))
Example #2
def create_corpus_df(tweet, target):
    corpus = []

    for x in tweet[tweet['target'] == target]['text'].str.split():
        for i in x:
            corpus.append(i)
    return corpus
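A short usage sketch for the function above; the two-row DataFrame and the target value 1 are purely illustrative assumptions, the function only expects 'text' and 'target' columns.

import pandas as pd

# hypothetical tweets frame with the 'text'/'target' columns the function expects
tweet = pd.DataFrame({'text': ['earthquake reported downtown', 'sunny day at the beach'],
                      'target': [1, 0]})
disaster_words = create_corpus_df(tweet, 1)  # -> ['earthquake', 'reported', 'downtown']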
Example #3
def tokenize_and_crawl(article):
    """
    Called when the news article is not found in the Hive database; crawl the web for it via Google News RSS.
    :param article: the article as an iterable of word strings
    :return: None
    """

    # Tokenize and lemmatize the input article
    article_string = ""

    for i in article:
        article_string += i
        article_string += " "

    tokens = word_tokenize(article_string)

    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token)
              for token in tokens]  # lemmatize each token to a common root form

    stop_w = stopwords.words('english')
    w = []

    # Remove stop words
    for z in tokens:
        if z not in stop_w:
            w.append(z)

    query_string = ""

    for o in w:
        query_string += o
        query_string += "+"

    # Crawl using google news API
    print "Keywords used to crawl NEWS sites for this article are :", query_string
    crawl_url = "http://news.google.com/news?q=" + query_string + "&output=rss"
    feed = feedparser.parse(crawl_url)

    corpus = []

    print "------------------------------------------------------"
    print "CRAWLING RESULTS : "
    for x in feed.entries:
        print x['title']
        corpus.append(x['title'])

    string_cosine_correlation_level3(article, corpus)
def hive_news_dump_connection():
    with pyhs2.connect(host='192.168.56.101',
                       port=10000,
                       authMechanism='PLAIN',
                       user='******',
                       password='******',
                       database='anuvrat') as conn:

        temp_query = 'SELECT name, message, description FROM ABC_NEWS LIMIT 10'
        with conn.cursor() as cur:
            cur.execute(temp_query)

            corpus = []

            for x in cur.fetchall():
                for i in x:
                    # strip UTF-8 replacement-character bytes and NUL characters from the Hive dump
                    i = i.replace('\xef\xbf\xbd\xef\xbf\xbd', '')
                    corpus.append(i.replace('\x00', ''))

            return corpus
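A hypothetical driver sketch tying the two functions above together, following the docstring: consult the Hive dump first and fall back to crawling when the article is not found there. The token-list input and the containment check are assumptions, not part of the original module.

# hypothetical glue code, not from the original source
hive_corpus = hive_news_dump_connection()
article = ["sample", "headline", "words"]  # assumed: an iterable of word strings

# a crude containment check stands in for whatever lookup the original system used
if " ".join(article) not in hive_corpus:
    tokenize_and_crawl(article)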
#stock_onlycols['Top1'] = stock_onlycols['Top1'].apply(lambda x: [item for item in x if item not in stop])
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

headlines = []
# combine the 25 headline columns of each row into a single string
for i in range(0, len(stock_onlycols)):
    headlines.append(' '.join(str(x) for x in stock_onlycols.iloc[i, 0:25]))
lemmatizer = WordNetLemmatizer()

corpus = []
for i in range(0, len(headlines)):
    words = headlines[i].split()
    # drop stopwords and lemmatize the remaining words
    words = [
        lemmatizer.lemmatize(word) for word in words
        if word not in set(stopwords.words('english'))
    ]
    headlines[i] = ' '.join(words)
    corpus.append(words)

from sklearn.feature_extraction.text import CountVectorizer
#cv = CountVectorizer(max_features=2500)
# corpus holds token lists, so pass an identity tokenizer and skip lowercasing
X = CountVectorizer(tokenizer=lambda doc: doc,
                    lowercase=False,
                    max_features=1199).fit_transform(corpus).toarray()
y = train['Label']
X = X.reshape(1199, 25)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20)
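For reference, a more conventional sketch of the same vectorization step: fitting the CountVectorizer directly on the joined headline strings built above instead of on pre-tokenized lists. The max_features value of 2500 is taken from the commented-out cv line and is otherwise an assumption.

from sklearn.feature_extraction.text import CountVectorizer

# fit on the already-lemmatized headline strings
cv = CountVectorizer(max_features=2500)
X_alt = cv.fit_transform(headlines).toarray()  # shape: (number of headlines, vocabulary size)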
Example #7
    # replace every character other than a-z and A-Z with a space
    review = re.sub('[^a-zA-Z]', ' ', sms['message'][i])
    # convert to lower case
    review = review.lower()
    # splitting the words
    review = review.split()

    # filter out the stopwords and stem the remaining words
    review = [
        ps.stem(word) for word in review
        if word not in set(stopwords.words('english'))
    ]
    # then join
    review = ' '.join(review)
    # and append to corpus
    corpus.append(review)

# convert labels to dummy variable
y = pd.get_dummies(sms['labels'])
y = y.iloc[:, 1].values

# implement BoW from sklearn using the countvectorizer library
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

# import test train split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=4)
Example #8
def create_corpus_new(df):
    corpus = []
    for tweet in tqdm(df['text']):
        words = [word.lower() for word in word_tokenize(tweet)]
        corpus.append(words)
    return corpus
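A brief usage sketch for create_corpus_new; the tiny DataFrame is an illustrative assumption, any frame with a 'text' column (plus the tqdm and word_tokenize imports) will do.

import pandas as pd

# hypothetical input frame; real usage would pass the full tweet DataFrame
sample_df = pd.DataFrame({'text': ['A simple example tweet']})
sample_corpus = create_corpus_new(sample_df)  # -> [['a', 'simple', 'example', 'tweet']]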
    y = pickle.load(f)

corpus = []
#pre-processing text
for i in range(0, len(x)):
    # remove non-word characters (punctuation such as , . : ; ` ! etc.)
    data = re.sub(r'\W', ' ', str(x[i]))
    # convert to lower case
    data = data.lower()
    # remove single characters surrounded by whitespace
    data = re.sub(r'\s+[a-z]\s+', ' ', data)
    # remove a single character at the start of the text
    data = re.sub(r'^[a-z]\s+', ' ', data)
    # collapse extra whitespace
    data = re.sub(r'\s+', ' ', data)
    corpus.append(data)

#building TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer(max_features=2000,
                         min_df=3,
                         max_df=0.6,
                         stop_words=stopwords.words('english'))
x = vector.fit_transform(corpus).toarray()

#pickling the TFIDF model
with open("TFIDF.pickle", 'wb') as f:
    pickle.dump(vector, f)

#unpickling the TFIDF model
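# the original snippet ends here; a natural completion, mirroring the dump above, would be
with open("TFIDF.pickle", 'rb') as f:
    vector = pickle.load(f)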