import pickle
from collections import Counter

import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# DataSet is the project's own loading/preprocessing wrapper; its import is
# not shown in this excerpt.


def get_feature(self):
    """TF-IDF cosine similarity between each headline and its body."""
    ds = DataSet(self.name, self.path)
    data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc, self.sent)

    # Build a shared vocabulary over headlines and bodies, and keep the
    # concatenated texts for fitting a single vectorizer over both fields.
    vocabulary = set()
    headline_body = []
    for _, row in data.iterrows():
        vocabulary.update(row['Headline'].split(' '))
        vocabulary.update(row['Body'].split(' '))
        headline_body.append(row['Headline'] + ' ' + row['Body'])

    headlines = data.Headline.to_numpy()
    bodies = data.Body.to_numpy()

    vectorizer_path = '../feature_files/headline_body_tfidf_vectorizer.pkl'
    if self.name == 'train':
        # Fit on the training texts and persist the vectorizer so other
        # splits are transformed with the same vocabulary and IDF weights.
        vectorizer = TfidfVectorizer(vocabulary=vocabulary).fit(headline_body)
        with open(vectorizer_path, 'wb') as f:
            pickle.dump(vectorizer, f)
    else:
        with open(vectorizer_path, 'rb') as f:  # close the handle; the original leaked it
            vectorizer = pickle.load(f)

    # The transform calls were duplicated in both branches; hoisted here.
    headline_TF_IDF = vectorizer.transform(headlines)
    body_TF_IDF = vectorizer.transform(bodies)

    # One cosine similarity per (headline, body) row pair.
    features = [cosine_similarity(h, b)[0][0]
                for h, b in zip(headline_TF_IDF, body_TF_IDF)]
    return np.array(features)
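
# A minimal, self-contained sketch of the headline/body cosine-similarity
# idea above, on invented toy strings and without the DataSet plumbing:
def _demo_tfidf_cosine():
    vec = TfidfVectorizer().fit([
        "markets rally after rate cut",
        "stocks rallied sharply after the central bank cut rates",
    ])
    h = vec.transform(["markets rally after rate cut"])
    b = vec.transform(["stocks rallied sharply after the central bank cut rates"])
    print(cosine_similarity(h, b)[0][0])  # a single score in [0, 1]
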
def get_feature(self):
    """VADER sentiment scores for headline and body, concatenated (8 values per row)."""
    ds = DataSet(path=self.path, name=self.name)
    data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc, self.sent)
    sid = SentimentIntensityAnalyzer()

    def get_sentiment(text):
        # polarity_scores returns {'neg', 'neu', 'pos', 'compound'}.
        return list(sid.polarity_scores(text).values())

    sentiments = []
    for _, row in data.iterrows():
        # Append the flat 8-value row directly; the original wrapped it in an
        # extra list and then undid that with reshape(-1, 8).
        sentiments.append(get_sentiment(row['Headline']) + get_sentiment(row['Body']))
    return np.array(sentiments)
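
# For reference, polarity_scores yields four values per text, which is why
# each row above has 8 columns. A quick check (assumes the NLTK
# 'vader_lexicon' resource has been downloaded):
def _demo_vader():
    sid = SentimentIntensityAnalyzer()
    print(sid.polarity_scores("This is a great example!"))
    # -> {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ...}
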
def get_feature(self):
    """Binary indicators: which cue words occur in the headline and in the body."""
    ds = DataSet(path=self.path, name=self.name)
    data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc)
    cue_words_list = self.get_cue_words()
    X = []
    for _, row in data.iterrows():
        # Match on tokens, not raw substrings: `word in row['Headline']`
        # would count "no" as present in "note".
        headline_tokens = set(row['Headline'].split())
        body_tokens = set(row['Body'].split())
        X_row = [1 if word in headline_tokens else 0 for word in cue_words_list]
        X_row += [1 if word in body_tokens else 0 for word in cue_words_list]
        X.append(X_row)
    return np.array(X)
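
# A minimal illustration of the substring pitfall avoided above:
def _demo_token_vs_substring():
    text = "note the results"
    print("no" in text)          # True  -- substring hit inside "note"
    print("no" in text.split())  # False -- token-level membership
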
def nGramMatching(self):
    """TF-IDF-weighted n-gram (n = 1..5) overlap score between headline and body."""
    ds = DataSet(path=self.path, name=self.name)
    data = ds.preprocess(self.lemmatize, self.remove_stop, self.remove_punc)
    idf = self.getIDF(data["Body"].to_numpy())

    features = []
    for _, row in data.iterrows():
        # Collect all 1- to 5-gram keys (tuples of tokens) from both texts.
        H, A = [], []
        for n in range(1, 6):
            H.extend(self.get_ngram(n, row['Headline']).keys())
            A.extend(self.get_ngram(n, row['Body']).keys())

        h_counts = Counter(H)  # avoids the original's O(n^2) list.count calls
        a_counts = Counter(A)
        total = 0  # the original named this `sum`, shadowing the builtin
        for h in H:
            # Weight each headline n-gram by how often it appears in either
            # text, scaled by its length (longer matches count more) and IDF.
            tf_h = (h_counts[h] + a_counts[h]) * len(h)
            total += tf_h * idf.get(" ".join(h), 0)

        # Guard against an empty headline/body pair, which would divide by zero.
        features.append(total / (len(H) + len(A)) if H or A else 0.0)
    return np.array(features).reshape(-1, 1)
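
# The score above assumes self.get_ngram(n, text) maps each n-gram, as a
# tuple of tokens, to its count (hence len(h) and " ".join(h) in the loop).
# A hypothetical stand-in with those semantics, for illustration only:
def _demo_get_ngram(n, text):
    tokens = text.split()
    grams = {}
    for i in range(len(tokens) - n + 1):
        g = tuple(tokens[i:i + n])
        grams[g] = grams.get(g, 0) + 1
    return grams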