import spacy
from string import punctuation

nlp = spacy.load('en_core_web_sm')

def tokenize(text):
    """Tokenize `text` with spaCy, dropping punctuation and newline tokens."""
    docs = nlp(text)
    tokens = [token.text for token in docs]
    # Use a distinct local name: rebinding `punctuation` inside the function
    # would shadow the module-level import and raise UnboundLocalError.
    punct = list(punctuation) + ['\n']
    return [token for token in tokens if token not in punct]
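# Minimal usage sketch (assumes spaCy and its en_core_web_sm model are
# installed; spaCy emits '\n' as a standalone token, which is why it is
# filtered alongside punctuation):
#
#     tokenize("Hello, world!\nThis is a test.")
#     # -> ['Hello', 'world', 'This', 'is', 'a', 'test']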
def save_article(self):
    """Append this article's metadata as one row of the output CSV file."""
    from csv import writer

    row_contents = [
        self.get_article_title(),
        self.get_article_author(),
        self.get_article_date(),
        self.get_article_summary(),
        self.get_article_tags(),
    ]
    with open(self.file_name, 'a+', newline='') as write_obj:
        csv_writer = writer(write_obj)
        csv_writer.writerow(row_contents)
# Assumed module-level imports for this method: `from scipy.sparse import hstack`
# and a model (de)serializer such as `from joblib import load`.
def classify_article(self, articles_filename):
    # Load the trained models: idf tables, classifier, SVD, and the tf-idf
    # vectorizer. (The title idf tables are unpacked but not used below.)
    idf_liwc_article, idf_punc_article, idf_liwc_title, idf_punc_title = \
        load_dataset("train_idf_by_publisher.pkl")
    classifier = load("bypublisher_classification_model.pkl")
    svd = load("bypublisher_svd_model.pkl")
    tfidf = load("bypublisher_tfidf_model.pkl")

    for article in self.efficient_read_article_text(articles_filename):
        # Count LIWC-category words in the article body and title.
        for feature, words in self.liwc_features.items():
            for word in words:
                counts_articles = article.text.count(word)
                counts_title = article.title.count(word)
                article.liwc_counts[feature] += counts_articles
                article.liwc_counts_title[feature] += counts_title
                article.all_liwc += counts_articles
                article.all_liwc_title += counts_title

        # Count punctuation-category characters in the article body and title.
        for feature, words in self.punctuations.items():
            for word in words:
                counts_articles = article.text.count(word)
                counts_title = article.title.count(word)
                article.punctuation_counts[feature] += counts_articles
                article.punctuation_counts_title[feature] += counts_title
                article.all_punc += counts_articles
                article.all_punc_title += counts_title

        # Prepare features: unigram tf-idf, LIWC tf-idf, punctuation tf-idf,
        # and structural counts (quotes, paragraphs, URLs).
        unigrams = tfidf.transform([" ".join(article.clean_article())])
        liwc = []
        punctuation = []
        structure = [article.count_quotes, article.count_paragraphs, article.count_urls]
        for feature in self.liwc_features.keys():
            tf_article = 0
            if article.all_liwc != 0:
                tf_article = article.liwc_counts[feature] / article.all_liwc
            liwc.append(tf_article * idf_liwc_article[feature])
        for feature in self.punctuations.keys():
            tf_article = 0
            if article.all_punc != 0:
                tf_article = article.punctuation_counts[feature] / article.all_punc
            punctuation.append(tf_article * idf_punc_article[feature])

        test_article = hstack([unigrams, [liwc], [punctuation], [structure]])
        test_article = svd.transform(test_article)

        # Classify the article and write the prediction.
        clf_pred = classifier.predict(test_article)[0]
        prediction = "true" if clf_pred == 1 else "false"
        confidence = 0.0
        self.outFile.write(article.id + " " + prediction + " " + str(confidence) + "\n")
import os

def POS(pathin, file):
    """Extract turn/speaker/POS triples from a CHAT (.cha) transcript and
    return them as CSV-formatted text."""
    os.chdir(pathin)
    removelist = ['\t', '\r']

    # First pass: read the participant codes from the @Participants header.
    with open(file, 'r') as f:
        text = f.read()
    for item in removelist:
        text = text.replace(item, '')
    people = []
    for line in text.split('@'):
        if line.startswith('Participants:'):
            names = line.replace('Participants:', '').split(',')
            # Keep only the three-letter speaker code (e.g. 'CHI', 'MOT').
            people = [n.replace(' ', '').replace('\n', '')[0:3] for n in names]

    # Second pass: keep each speaker line together with its %mor tier.
    with open(file, 'r') as f:
        text = f.read()
    for item in removelist:
        text = text.replace(item, '')
    text = text.split('\n')
    trans = []
    for line, val in enumerate(text):
        label = val[1:4]
        if label in people and line + 1 < len(text) and text[line + 1].startswith('%mor'):
            trans.append(val)
        elif label == 'mor':
            trans.append(val)
    grouped = [trans[n:n + 2] for n in range(0, len(trans), 2)]

    # Strip sentence punctuation from the %mor tier and record each speaker.
    sentence_punct = '.!?'
    turn, speaker, mor = [], [], []
    for i, pair in enumerate(grouped):
        turn.append(i + 1)
        s, m = pair[0], pair[1]
        for c in sentence_punct:
            m = m.replace(c, '')
        m = m.strip().replace('%mor:', '')
        speaker.append(s[1:4])
        mor.append(m)
    z = list(zip(turn, speaker, mor))

    # Each %mor token looks like 'pos|word'; keep only the POS tag.
    turn, speaker, pss = [], [], []
    for t, s, m in z:
        turn.append(t)
        speaker.append(s)
        pss.append([token.split('|')[0] for token in m.split(' ')])
    pw = list(zip(turn, speaker, pss))

    # Emit one CSV row per POS tag: Turn,Speaker,POS.
    lines = []
    for num, spk, tags in pw:
        for p in tags:
            lines.append(str(num) + ',' + spk + ',' + p + '\n')
    header = 'Turn,' + 'Speaker,' + 'POS' + '\n'
    return header + ''.join(lines)
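# Usage sketch ('transcripts' and 'sample.cha' are hypothetical paths; POS
# expects a CHAT transcript with %mor tiers, as produced by CLAN):
#
#     csv_text = POS('transcripts', 'sample.cha')
#     with open('sample_pos.csv', 'w') as out:
#         out.write(csv_text)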
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

stopWords = list(STOP_WORDS)
nlp = spacy.load('en_core_web_sm')

# `text` is the input document, defined elsewhere in the script.
docs = nlp(text)
tokens = [token.text for token in docs]

# Treat the newline character as punctuation so it is filtered out too.
punctuation = list(punctuation)
punctuation.append('\n')
tokens = [token for token in tokens if token not in punctuation]

# Count how often each non-stopword, non-punctuation word appears.
word_frequency = {}
for word in docs:
    if word.text.lower() not in stopWords and word.text.lower() not in punctuation:
        if word.text not in word_frequency:
            word_frequency[word.text] = 1
        else:
            word_frequency[word.text] += 1
# Assumed module-level imports for this method: `import numpy as np`,
# `from scipy.sparse import hstack`, and a model (de)serializer such as
# `from joblib import load`.
def classify_article(self, articles_filename):
    # Load the trained models: idf tables, classifier, and the tf-idf
    # vectorizers for the article body and the title. (The title idf tables
    # are unpacked but not used below.)
    idf_liwc_article, idf_punc_article, idf_liwc_title, idf_punc_title = \
        load_dataset("train_idf_by_articles.pkl")
    classifier = load("byarticle_classification_model.pkl")
    tfidf = load("byarticle_article_tfidf_model.pkl")
    tfidf_title = load("byarticle_title_tfidf_model.pkl")

    for article in self.efficient_read_article_text(articles_filename):
        # Sum the per-word emotion vectors over the whole article body.
        article_emotions = np.sum(list(map(self.emotions_word, article.text.split(" "))), axis=0)

        # Count LIWC-category words in the article body and title.
        for feature, words in self.liwc_features.items():
            for word in words:
                counts_articles = article.text.count(word)
                counts_title = article.title.count(word)
                article.liwc_counts[feature] += counts_articles
                article.liwc_counts_title[feature] += counts_title
                article.all_liwc += counts_articles
                article.all_liwc_title += counts_title

        # Count punctuation-category characters in the article body and title.
        for feature, words in self.punctuations.items():
            for word in words:
                counts_articles = article.text.count(word)
                counts_title = article.title.count(word)
                article.punctuation_counts[feature] += counts_articles
                article.punctuation_counts_title[feature] += counts_title
                article.all_punc += counts_articles
                article.all_punc_title += counts_title

        # Prepare features: body and title unigram tf-idf, LIWC tf-idf,
        # punctuation tf-idf, structural counts, and emotion sums.
        unigrams = tfidf.transform([" ".join(article.clean_article())])
        title_unigrams = tfidf_title.transform([article.title])
        liwc = []
        punctuation = []
        structure = [article.count_quotes, article.count_paragraphs, article.count_urls]
        for feature in self.liwc_features.keys():
            tf_article = 0
            if article.all_liwc != 0:
                tf_article = article.liwc_counts[feature] / article.all_liwc
            liwc.append(tf_article * idf_liwc_article[feature])
        for feature in self.punctuations.keys():
            tf_article = 0
            if article.all_punc != 0:
                tf_article = article.punctuation_counts[feature] / article.all_punc
            punctuation.append(tf_article * idf_punc_article[feature])

        test_article = hstack([unigrams, title_unigrams, liwc, punctuation, structure, article_emotions])

        # Classify the article and write the prediction.
        clf_pred = classifier.predict(test_article)[0]
        prediction = "true" if clf_pred == 1 else "false"
        # confidence = max(classifier.predict_proba(test_article)[0])
        confidence = 0.0
        print(article.id + " " + prediction + " " + str(confidence))
        self.outFile.write(article.id + " " + prediction + " " + str(confidence) + "\n")
def get_punctuation(self):
    """Return the standard punctuation characters plus the newline."""
    from string import punctuation
    return list(punctuation) + ['\n']
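# Usage sketch (`obj` is a hypothetical instance of the enclosing class):
#
#     punct = obj.get_punctuation()
#     clean = [tok for tok in tokens if tok not in punct]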