Example #1
0
    def get_feature(self):
        """Compute the cosine similarity between headline and body TF-IDF vectors.

        Builds a vocabulary covering both headlines and bodies, fits a
        TfidfVectorizer on the train split (persisting it to disk) or loads
        the persisted vectorizer otherwise, and returns one similarity score
        per row.

        Returns:
            np.ndarray of shape (n_rows,): cosine similarity of each
            headline/body TF-IDF pair.
        """
        ds = DataSet(self.name, self.path)
        data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc, self.sent)

        # Combined vocabulary and joined headline+body documents, so the
        # vectorizer's feature space covers words from both fields.
        vocabulary = set()
        headline_body = []
        for i, row in data.iterrows():
            vocabulary.update(row['Headline'].split(' '))
            vocabulary.update(row['Body'].split(' '))
            headline_body.append(row['Headline'] + ' ' + row['Body'])
        headlines = data.Headline.to_numpy()
        bodies = data.Body.to_numpy()

        if self.name == 'train':
            # Fit on the train split and persist the fitted vectorizer so
            # other splits transform into the same feature space.
            vectorizer = TfidfVectorizer(vocabulary=vocabulary)
            headline_body_TF_IDF = vectorizer.fit(headline_body)
            with open('../feature_files/headline_body_tfidf_vectorizer.pkl', 'wb') as f:
                pickle.dump(headline_body_TF_IDF, f)
        else:
            # FIX: the original pickle.load(open(...)) never closed the file
            # handle; use a context manager for deterministic cleanup.
            with open('../feature_files/headline_body_tfidf_vectorizer.pkl', 'rb') as f:
                headline_body_TF_IDF = pickle.load(f)
        # The transform calls were duplicated in both branches; hoisted here.
        headline_TF_IDF = headline_body_TF_IDF.transform(headlines)
        body_TF_IDF = headline_body_TF_IDF.transform(bodies)

        features = []
        for h, b in zip(headline_TF_IDF, body_TF_IDF):
            features.append(sklearn.metrics.pairwise.cosine_similarity(h, b)[0][0])
        return np.array(features)
Example #2
0
    def get_feature(self):
        """Return VADER polarity scores for each row's headline and body.

        Every row contributes the four polarity scores for the headline
        followed by the four for the body, giving an (n_rows, 8) array.
        """
        analyzer = SentimentIntensityAnalyzer()

        def scores_of(text):
            # polarity_scores returns a dict of four values; keep them in order.
            return list(analyzer.polarity_scores(text).values())

        ds = DataSet(path=self.path, name=self.name)
        data = ds.preprocess(self.lemmatize, self.remove_stop,
                             self.remove_punc, self.sent)
        rows = [[scores_of(row['Headline']) + scores_of(row['Body'])]
                for _, row in data.iterrows()]
        return np.array(rows).reshape(-1, 8)
Example #3
0
 def get_feature(self):
     """Binary cue-word indicator features.

     For every cue word, emits 1/0 for its presence (substring containment)
     in the headline, followed by 1/0 for its presence in the body.
     """
     ds = DataSet(path=self.path, name=self.name)
     data = ds.preprocess(self.lemmatize, self.remove_stop,
                          self.remove_punc)
     cue_words = self.get_cue_words()
     feature_rows = []
     for _, row in data.iterrows():
         headline, body = row['Headline'], row['Body']
         indicators = [int(word in headline) for word in cue_words]
         indicators += [int(word in body) for word in cue_words]
         feature_rows.append(indicators)
     return np.array(feature_rows)
Example #4
0
 def nGramMathing(self):
     """TF-IDF-weighted n-gram matching score per headline/body pair.

     For each row, collects all 1- to 5-gram keys of the headline (H) and
     body (A). Each occurrence of a headline n-gram h contributes
     (count_H(h) + count_A(h)) * len(h) * idf(h), and the total is
     normalised by len(H) + len(A).

     Returns:
         np.ndarray of shape (n_rows, 1): one match score per row.
     """
     from collections import Counter  # local import; no file import block in view

     ds = DataSet(path=self.path, name=self.name)
     data = ds.preprocess(self.lemmatize, self.remove_stop,
                          self.remove_punc)
     idf = self.getIDF(data["Body"].to_numpy())
     features = []
     for index, row in data.iterrows():
         H = []
         A = []
         for n in range(1, 6):
             H.extend(self.get_ngram(n, row['Headline']).keys())
             A.extend(self.get_ngram(n, row["Body"]).keys())
         # FIX: the original called H.count/A.count inside a loop over H,
         # which is O(n^2); pre-count once with Counter and iterate unique
         # n-grams weighted by their multiplicity — same total, O(n).
         h_counts = Counter(H)
         a_counts = Counter(A)
         # Renamed accumulator so the builtin `sum` is no longer shadowed.
         total = 0
         for h, h_count in h_counts.items():
             TF_h = (h_count + a_counts[h]) * len(h)
             total += h_count * TF_h * idf.get(" ".join(h), 0)
         # FIX: guard against ZeroDivisionError when a row yields no n-grams.
         denom = len(H) + len(A)
         sc = total / denom if denom else 0.0
         features.append(sc)
     return np.array(features).reshape(-1, 1)