import re

from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords


def process(text):
    p = PorterStemmer()
    doc_nor = text.lower()
    doc_sw = remove_stopwords(doc_nor)
    doc_stem = p.stem_sentence(doc_sw)
    stem = re.findall(r'[\w]+', doc_stem)  # regex tokenisation; unused, the split below is returned instead
    return doc_stem.split()
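# Hedged usage sketch for process() above: a minimal check, assuming the gensim
# helpers imported above; the sample sentence is illustrative only.
if __name__ == '__main__':
    tokens = process("The striped bats are hanging on their feet for best")
    print(tokens)  # prints the lowercased, stopword-free, Porter-stemmed tokens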
import time

import gensim
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize


class Word2VecPipeline(object):

    def open_spider(self, spider):
        # Create an empty model
        w2v = gensim.models.Word2Vec([['seo']], min_count=1)
        self.name = '/tmp/Word2Vec' + str(time.time())
        # Save it
        w2v.save(self.name)
        self.p = PorterStemmer()
        self.stop_words = set(stopwords.words('french'))

    def process_item(self, item, spider):
        if 'title' in item:
            # This time, we don't update the item; instead we build the model.
            document = item.get('title') + ' ' + item.get('body')
            words = [
                word_tokenize(self.p.stem_sentence(s))
                for s in sent_tokenize(document)
            ]
            # Load the current model
            w2v = gensim.models.Word2Vec.load(self.name)
            # Train our model
            w2v.build_vocab(words, update=True)
            w2v.train(words,
                      total_examples=w2v.corpus_count,
                      epochs=w2v.epochs)  # `w2v.iter` in gensim < 4.0
            # Save it for the next item
            w2v.save(self.name)
        return item
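# Hedged follow-up sketch: once the spider has finished, the incrementally trained
# model saved by Word2VecPipeline can be reloaded with the standard gensim API.
# The save path and the query word below are assumptions for illustration.
import gensim

w2v = gensim.models.Word2Vec.load('/tmp/Word2Vec1234567890.0')  # hypothetical save path
print(w2v.wv.most_similar('seo', topn=5))  # nearest neighbours of the seed word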
def __iter__(self):
    p = PorterStemmer()
    for entry in scandir("./dblpfiledir"):
        with open(entry.path, "r", encoding="utf-8") as f:
            jsoncontent = json.load(f)
        doc = jsoncontent["abstract"]
        if len(doc) > 0:
            doc = remove_stopwords(doc)
            doc = p.stem_sentence(doc)
            words = simple_preprocess(doc, deacc=True)
            yield TaggedDocument(words=words, tags=[jsoncontent['index']])
def __iter__(self):
    p = PorterStemmer()
    for index, row in self.train_data.iterrows():
        name = row['ScriptLink']
        with open('./movie_scripts/' + name) as file:
            # print("Im here")
            script = file.readlines()
        script = "".join(script)
        script = remove_stopwords(script)
        script = p.stem_sentence(script)
        words = simple_preprocess(script)
        yield TaggedDocument(words=words, tags=[index])
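# Hedged training sketch: both __iter__ methods above yield TaggedDocument objects,
# which is the corpus shape gensim's Doc2Vec expects. The wrapper class name
# MovieScriptCorpus and the hyperparameters are assumptions, not from the original code.
from gensim.models.doc2vec import Doc2Vec

corpus = MovieScriptCorpus(train_data)  # hypothetical class exposing the __iter__ above
model = Doc2Vec(vector_size=100, min_count=2, epochs=20)
model.build_vocab(corpus)  # first pass: collect the vocabulary
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
vector = model.infer_vector(['new', 'unseen', 'document'])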
class TextCleaner():
    """A class that cleans up text data."""

    def __init__(self):
        # load stop words
        self.stop_words = stop_words.get_stop_words("en")
        # prepare stemmer
        self.stemmer = PorterStemmer()

    def clean(self, text):
        # keep only ASCII letters; digits and punctuation are replaced by spaces
        text = re.sub("[^a-zA-Z]+", " ", text)
        # stem words
        text = self.stemmer.stem_sentence(text)
        return text
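# Hedged usage sketch for TextCleaner: assumes the stop_words package and gensim's
# PorterStemmer are importable as in the class; the input string is illustrative only.
# Note that clean() only strips characters and stems; the loaded stop word list is
# not applied inside clean() itself.
cleaner = TextCleaner()
print(cleaner.clean("Running 3 experiments, twice!"))  # letters only, Porter-stemmed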
from sklearn.model_selection import train_test_split

review_train, review_test, label_train, label_test = train_test_split(
    review, label, test_size=0.2, shuffle=True)

import numpy as np
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords

porterStemmer = PorterStemmer()

rev_train_texts = []
for text in review_train:
    removed_text = remove_stopwords(text)
    removed_text = porterStemmer.stem_sentence(removed_text)
    rev_train_texts.append(removed_text)
rev_train_texts = np.array(rev_train_texts, dtype='O')

rev_test_texts = []
for text in review_test:
    removed_text = remove_stopwords(text)
    removed_text = porterStemmer.stem_sentence(removed_text)
    rev_test_texts.append(removed_text)
rev_test_texts = np.array(rev_test_texts, dtype='O')

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
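# Hedged continuation sketch: the Tokenizer / pad_sequences imports above are not
# used yet, so this shows one plausible next step. num_words, oov_token and maxlen
# are assumed values, not taken from the original notebook.
tokenizer = Tokenizer(num_words=10000, oov_token='<unk>')
tokenizer.fit_on_texts(rev_train_texts)

train_seqs = pad_sequences(tokenizer.texts_to_sequences(rev_train_texts), maxlen=200)
test_seqs = pad_sequences(tokenizer.texts_to_sequences(rev_test_texts), maxlen=200)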
class DatasetParser:

    def __init__(self):
        # info
        self.cluster_info = dict()
        self.article_info = dict()

        if config_meta['word_tokenizer'] == 'bert':
            self.word_tokenize = config.bert_tokenizer.tokenize
        elif config_meta['word_tokenizer'] == 'nltk':
            self.word_tokenize = nltk.tokenize.word_tokenize
        else:
            raise ValueError('Invalid word_tokenizer: {}'.format(
                config_meta['word_tokenizer']))

        self.sent_tokenize = nltk.tokenize.sent_tokenize
        self.porter_stemmer = PorterStemmer()

        if config_meta['texttiling']:
            self.para_tokenize = TextTilingTokenizer()

        # base pat
        BASE_PAT = r'(?<=<{0}> )[\s\S]*?(?= </{0}>)'
        BASE_PAT_WITH_NEW_LINE = r'(?<=<{0}>\n)[\s\S]*?(?=\n</{0}>)'
        BASE_PAT_WITH_RIGHT_NEW_LINE = r'(?<=<{0}>)[\s\S]*?(?=\n</{0}>)'

        # query pat
        self.id_pat = re.compile(BASE_PAT.format('num'))
        self.title_pat = re.compile(BASE_PAT.format('title'))
        self.narr_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('narr'))

        # article pat
        self.text_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('TEXT'))
        self.graphic_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('GRAPHIC'))
        self.type_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('TYPE'))
        self.para_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('P'))

        self.proc_params_for_questions = {
            'rm_dialog': False,
            'rm_stop': False,
            'stem': True,
        }

    def _get_word_ids(self, words):
        word_ids = config.bert_tokenizer.convert_tokens_to_ids(words)
        return word_ids

    def _proc_sent(self, sent, rm_dialog, rm_stop, stem, rm_short=None, min_nw_sent=3):
        sent = sent.lower()
        sent = re.sub(r'\s+', ' ', sent).strip()  # remove extra spaces

        if not sent:
            return None

        if rm_short and len(nltk.tokenize.word_tokenize(sent)) < min_nw_sent:
            return None

        if rm_dialog:
            dialog_tokens = ["''", "``"]
            for tk in dialog_tokens:
                if tk in sent:
                    logger.info('Remove dialog')
                    return None

            if config.test_year == '2005' and sent[0] == "'" and ('says' in sent or 'said' in sent):
                logger.info('Remove dialog')
                return None

        if rm_stop:
            sent = remove_stopwords(sent)

        if stem:
            sent = self.porter_stemmer.stem_sentence(sent)

        return sent

    def _proc_para(self, pp, rm_dialog=True, rm_stop=True, stem=True, to_str=False):
        """
        Return both original paragraph and processed paragraph.

        :param pp:
        :param rm_dialog:
        :param rm_stop:
        :param stem:
        :param to_str: if True, concatenate sentences and return.
        :return:
        """
        original_para_sents, processed_para_sents = [], []
        for ss in self.sent_tokenize(pp):
            ss_origin = self._proc_sent(ss, rm_dialog=False, rm_stop=False, stem=False)
            ss_proc = self._proc_sent(ss, rm_dialog=rm_dialog, rm_stop=rm_stop, stem=stem)

            if ss_proc:  # make sure the sent is not removed, i.e., is not empty and is not in a dialog
                original_para_sents.append(ss_origin)
                processed_para_sents.append(ss_proc)

        if not to_str:
            return original_para_sents, processed_para_sents

        para_origin = ' '.join(original_para_sents)
        para_proc = ' '.join(processed_para_sents)
        return para_origin, para_proc

    def get_doc(self, fp, concat_paras):
        """
        get an article from file.

        first get all natural paragraphs in the text, then:
            if concat_paras, return paragraphs joined by \n;
            if using texttiling, return subtopic tiles;
            if not above, return paragraphs.
        """
        with io.open(fp, encoding='utf-8') as f:
            article = f.read()

        pats = [self.text_pat, self.graphic_pat]
        PARA_SEP = '\n\n'

        for pat in pats:
            text = re.search(pat, article)
            if not text:
                continue
            text = text.group()

            # if there is '<p>' in text, gather them to text
            paras = re.findall(self.para_pat, text)
            if paras:
                text = PARA_SEP.join(paras)

            if concat_paras:
                return text

            # for text tiling: if the paragraph break is a single '\n', double it
            pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*")
            if not pattern.search(text):
                text = text.replace('\n', PARA_SEP)

            if config_meta['texttiling']:
                try:
                    tiles = self.para_tokenize.tokenize(text)
                except ValueError:  # return short text as tiles
                    tiles = [text]
                return tiles

            if paras:
                return paras
            else:
                return text.split(PARA_SEP)

        logger.warning('No article content in {0}'.format(fp))
        return None

    def doc2sents(self, fp, para_org=False, rm_dialog=True, rm_stop=True, stem=True, rm_short=None):
        """
        :param fp:
        :param para_org: bool
        :return: if para_org=True, 2-layer nested lists; else: flat lists.
        """
        paras = self.get_doc(fp, concat_paras=False)

        original_sents, processed_sents = [], []
        if not paras:
            return [], []

        for pp in paras:
            original_para_sents, processed_para_sents = self._proc_para(
                pp, rm_dialog=rm_dialog, rm_stop=rm_stop, stem=stem)
            if para_org:
                original_sents.append(original_para_sents)
                processed_sents.append(processed_para_sents)
            else:
                original_sents.extend(original_para_sents)
                processed_sents.extend(processed_para_sents)

        return original_sents, processed_sents

    def doc2paras(self, fp, rm_dialog=True, rm_stop=True, stem=True):
        paras = self.get_doc(fp, concat_paras=False)
        if not paras:
            return [], []

        original_paras, processed_paras = [], []
        for pp in paras:
            original_para_sents, processed_para_sents = self._proc_para(
                pp, rm_dialog=rm_dialog, rm_stop=rm_stop, stem=stem)
            para_origin = ' '.join(original_para_sents)
            para_proc = ' '.join(processed_para_sents)
            original_paras.append(para_origin)
            processed_paras.append(para_proc)

        return original_paras, processed_paras

    def cid2sents(self, cid, rm_dialog=True, rm_stop=True, stem=True, max_ns_doc=None):
        """
        Load all sentences in a cluster.

        :param cid:
        :param rm_dialog:
        :param rm_stop:
        :param stem:
        :param max_ns_doc:
        :return: a 2D list.
        """
        original_sents, processed_sents = [], []
        doc_ids = tools.get_doc_ids(cid, remove_illegal=rm_dialog)  # if rm dialog, rm illegal docs.
        for did in doc_ids:
            doc_fp = tools.get_doc_fp(did)
            # 2d if para_org==True; 1d otherwise.
            original_doc_sents, processed_doc_sents = dataset_parser.doc2sents(
                fp=doc_fp,
                para_org=config_meta['para_org'],
                rm_dialog=rm_dialog,
                rm_stop=rm_stop,
                stem=stem)
            if max_ns_doc:
                original_doc_sents = original_doc_sents[:max_ns_doc]
                processed_doc_sents = processed_doc_sents[:max_ns_doc]
            original_sents.append(original_doc_sents)
            processed_sents.append(processed_doc_sents)

        return original_sents, processed_sents

    def cid2sents_tdqfs(self, cid):
        cc_dp = join(path_parser.data_tdqfs_sentences, cid)
        fns = [fn for fn in listdir(cc_dp)]
        original_sents, processed_sents = [], []
        for fn in fns:
            sentences = [ss.strip('\n') for ss in io.open(join(cc_dp, fn)).readlines()]
            original_doc_sents, processed_doc_sents = [], []
            for ss in sentences:
                ss_origin = self._proc_sent(ss, rm_dialog=False, rm_stop=False, stem=False)
                ss_proc = self._proc_sent(ss, rm_dialog=False, rm_stop=True, stem=True)
                if ss_proc:
                    original_doc_sents.append(ss_origin)
                    processed_doc_sents.append(ss_proc)
            original_sents.append(original_doc_sents)
            processed_sents.append(processed_doc_sents)
        return original_sents, processed_sents

    def cid2paras(self, cid, rm_dialog=True, rm_stop=True, stem=True, max_np_doc=None):
        original_paras, processed_paras = [], []
        doc_ids = tools.get_doc_ids(cid, remove_illegal=rm_dialog)  # if rm dialog, rm illegal docs.
        for did in doc_ids:
            doc_fp = tools.get_doc_fp(did)
            original_doc_paras, processed_doc_paras = dataset_parser.doc2paras(
                fp=doc_fp, rm_dialog=rm_dialog, rm_stop=rm_stop, stem=stem)
            if max_np_doc:
                original_doc_paras = original_doc_paras[:max_np_doc]
                processed_doc_paras = processed_doc_paras[:max_np_doc]
            original_paras.append(original_doc_paras)
            processed_paras.append(processed_doc_paras)
        return original_paras, processed_paras

    def parse(self, para, clip_and_mask, offset, rm_dialog, rm_stop, stem):
        """
        parse a para and organize results by words.
        """
        sents = [
            self.word_tokenize(self._proc_sent(sent, rm_dialog, rm_stop, stem))
            for sent in self.sent_tokenize(para)
        ]

        if not clip_and_mask:  # only index after clipping
            return sents

        return clip_and_mask(sents, offset, join_query_para=config.join_query_para)

    def parse_query(self, query):
        """
        parse a query string => a dict with keys: ('words', 'sent_mask').
        """
        if 'max_nw_query' not in config_model:
            raise ValueError('Specify max_nw_query in config to clip query!')
        return self.word_tokenize(query)[:config_model['max_nw_query']]

    def parse_doc2sents(self, fp):
        """
        From file, parse a doc => a dict with keys: {'sents', 'doc_masks'}.

        The value of 'sents': 2D nested list; each list consists of (clipped) sentence words.
        """
        _, processed_sents = self.doc2sents(fp, para_org=False, rm_dialog=True, rm_stop=False, stem=False)
        if not processed_sents:
            return None

        sents = [self.sent2words(sent) for sent in processed_sents]
        res = cm_sl.clip_and_mask_doc_sents(sents=sents)
        return res

    def sent2words(self, sent):
        """
        tokenize the given preprocessed sent.

        :param sent:
        :return:
        """
        return self.word_tokenize(sent)

    def parse_rel_sents_file(self, rel_sents_fp, rm_dialog=True, rm_stop=True, stem=True):
        original_sents, processed_sents = [], []
        with io.open(rel_sents_fp, encoding='utf-8', mode='r') as relv_sents_f:
            sents = [sent.strip('\n') for sent in relv_sents_f.readlines()]
        for sent in sents:
            ss_proc = self._proc_sent(sent, rm_dialog=rm_dialog, rm_stop=rm_stop, stem=stem)
            if ss_proc:  # make sure the sent is not removed, i.e., is not empty and is not in a dialog
                original_sents.append(sent)
                processed_sents.append(ss_proc)
        return original_sents, processed_sents

    def parse_summary(self, fp):
        sent_as_line = fp.split('/')[-2] != '2007'
        with io.open(fp, encoding='latin1') as f:
            content = f.readlines()
        lines = [ll.rstrip('\n') for ll in content]
        if sent_as_line:
            return lines

        sents = list(itertools.chain(*[self.sent_tokenize(ll) for ll in lines]))
        return sents

    def build_query_info(self, year, tokenize_narr, concat_title_narr=False, proc=True):
        fp = join(path_parser.data_topics, '{}.sgml'.format(year))
        with io.open(fp, encoding='utf-8') as f:
            article = f.read()

        segs = article.split('\n\n\n')
        query_info = dict()
        for seg in segs:
            seg = seg.rstrip('\n')
            if not seg:
                continue

            query_id = re.search(self.id_pat, seg)
            title = re.search(self.title_pat, seg)
            narr = re.search(self.narr_pat, seg)

            if not query_id:
                logger.info('no query id in {0} in {1}...'.format(seg, year))
                break
            if not title:
                raise ValueError('no title in {0}...'.format(seg))
            if not narr:
                raise ValueError('no narr in {0}...'.format(seg))

            query_id = query_id.group()
            title = title.group()
            narr = narr.group()  # containing multiple sentences

            if proc:
                title = self._proc_sent(sent=title, rm_dialog=False, rm_stop=False, stem=True)
            if not title:
                raise ValueError('no title in {0}...'.format(seg))

            if tokenize_narr:
                narr = sent_tokenize(narr)
                if type(narr) != list:
                    narr = [narr]
                if proc:
                    narr = [
                        self._proc_sent(sent=narr_sent, **self.proc_params_for_questions)
                        for narr_sent in narr
                    ]
            elif proc:
                narr = self._proc_sent(sent=narr, **self.proc_params_for_questions)

            if not narr:
                raise ValueError('no narr in {0}...'.format(seg))

            cid = config.SEP.join((year, query_id))
            if not concat_title_narr:
                query_info[cid] = {
                    config.TITLE: title,
                    config.NARR: narr,  # str or list
                }
                continue

            # concat title and narr
            if tokenize_narr:  # list
                narr.insert(0, title)  # narr is a list
                query_info[cid] = narr
            else:  # str
                sep = '. '
                if title.endswith('.'):
                    sep = sep[-1]
                title = 'describe ' + title
                query_info[cid] = sep.join((title, narr))

        return query_info

    def get_cid2query(self, tokenize_narr):
        query_dict = dict()
        for year in config.years:
            annual_dict = self.build_query_info(year, tokenize_narr, concat_title_narr=True)
            query_dict = {
                **annual_dict,
                **query_dict,
            }
        return query_dict

    def get_cid2title(self):
        title_dict = dict()
        for year in config.years:
            annual_dict = self.build_query_info(year, tokenize_narr=False, concat_title_narr=False)
            for cid in annual_dict:
                annual_dict[cid] = annual_dict[cid][config.TITLE]
            title_dict = {
                **annual_dict,
                **title_dict,
            }
        return title_dict

    def get_cid2narr(self):
        title_dict = dict()
        for year in config.years:
            annual_dict = self.build_query_info(year, tokenize_narr=False, concat_title_narr=False)
            for cid in annual_dict:
                annual_dict[cid] = annual_dict[cid][config.NARR]
            title_dict = {
                **annual_dict,
                **title_dict,
            }
        return title_dict
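# Hedged usage sketch for DatasetParser: _proc_sent is the core normalisation step
# (lowercasing, whitespace cleanup, optional stopword removal and Porter stemming).
# Constructing the parser still requires the project's config / config_meta globals,
# which are not reproduced here; the sentence below is illustrative only.
parser = DatasetParser()
print(parser._proc_sent('The committee said it would review the policy.',
                        rm_dialog=False, rm_stop=True, stem=True))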
import re
import string

import pandas as pd
from gensim.parsing.porter import PorterStemmer

X = pd.read_csv("politifact.csv", sep=",", encoding="ISO-8859-1")
X_text = X['text'].values
print("Read")

p = PorterStemmer()
for i in range(len(X_text)):
    words = X_text[i].split()
    filtered_list = []
    for word in words:
        pattern = re.compile('[^\u0000-\u007F]+', re.UNICODE)  # Remove all non-ASCII characters
        word = pattern.sub('', word)
        word = word.translate(str.maketrans('', '', string.punctuation))
        filtered_list.append(word)
    result = ' '.join(filtered_list)
    X_text[i] = result
    X_text[i] = p.stem_sentence(X_text[i])

list1 = [['id', 'text', 'label']]
for i in range(0, len(X_text)):
    list1.append([X['id'][i], X_text[i], X['label'][i]])
df1 = pd.DataFrame(list1)
df1.to_csv('stemmed_politifact.csv', sep=',', index=False, header=False)
stopwords.add("say") stopwords.add("says") stopwords.add("will") stopwords.add("just") #Transform Category le = preprocessing.LabelEncoder() le.fit(train_data["Category"]) y_train = le.transform(train_data["Category"]) texts = [] #Stemming for whichtexts in train_data['Content']: whichtexts = whichtexts.encode("ascii", errors="ignore") whichtexts = p.stem_sentence(whichtexts) texts.append(whichtexts) #Vectorazation count_vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS) X_train = count_vectorizer.fit_transform(texts) #LSI svd = TruncatedSVD(n_components=107) X_train = svd.fit_transform(X_train) clf = myKNNClassifier(1) y_predict = cross_val_predict(clf, X_train, y_train, cv=10) print "accuracy:", metrics.accuracy_score(y_train, y_predict)
stopwords2 = set(STOPWORDS)
stopwords.update(stopwords2)  # union() alone returns a new set and discards it; update() modifies in place
stopwords.add("said")
stopwords.add("say")
stopwords.add("says")
stopwords.add("will")
stopwords.add("just")

# Transform Category
le = preprocessing.LabelEncoder()
le.fit(train_data["Category"])
y_train = le.transform(train_data["Category"])

# Stemming
for i in range(0, 10000):
    train_data.loc[[i], ['Content']] = p.stem_sentence(
        train_data.loc[[i], ['Content']].to_string(header=False, index=False))

# Vectorization
count_vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)
X_train = count_vectorizer.fit_transform(train_data['Content'])

# LSI
svd = TruncatedSVD(n_components=107)
X_train = svd.fit_transform(X_train)

clf = RandomForestClassifier(n_estimators=1000, max_depth=10)
y_predict = cross_val_predict(clf, X_train, y_train, cv=10)
print("accuracy:", metrics.accuracy_score(y_train, y_predict))
print("f_measure:", metrics.f1_score(y_train, y_predict, average='macro'))
print("precision:", metrics.precision_score(y_train, y_predict, average='macro'))
# Split into train and test sets
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_label, test_label = train_test_split(texts, label, test_size=0.2)

# Stopword removal and Porter stemming of the texts
import numpy as np
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords

porterStemmer = PorterStemmer()

fixed_train_texts = []
for text in train_texts:
    removed_text = remove_stopwords(text)  # Stopword removal
    removed_text = porterStemmer.stem_sentence(removed_text)  # Porter stemming
    fixed_train_texts.append(removed_text)
fixed_train_texts = np.array(fixed_train_texts, dtype='O')  # Train set

fixed_test_texts = []
for text in test_texts:
    removed_text = remove_stopwords(text)
    removed_text = porterStemmer.stem_sentence(removed_text)  # Porter stemming
    fixed_test_texts.append(removed_text)
fixed_test_texts = np.array(fixed_test_texts, dtype='O')  # Test set

print(f'text ({len(train_texts[0])}): {train_texts[0]}')
print(f'removed ({len(fixed_train_texts[0])}): {fixed_train_texts[0]} ({len(fixed_train_texts[0])})')
for i in range(len(X_text)):
    words = X_title[i].split()
    filtered_list = []
    for word in words:
        pattern = re.compile(r'[\W_]+', re.UNICODE)  # Remove all non-alphanumeric characters
        word = pattern.sub('', word)
        word = word.translate(str.maketrans('', '', string.punctuation))
        filtered_list.append(word)
    result = ' '.join(filtered_list)
    X_title[i] = result

    words = X_text[i].split()
    filtered_list = []
    for word in words:
        pattern = re.compile(r'[\W_]+', re.UNICODE)  # Remove all non-alphanumeric characters
        word = pattern.sub('', word)
        word = word.translate(str.maketrans('', '', string.punctuation))
        filtered_list.append(word)
    result = ' '.join(filtered_list)
    X_text[i] = result

    # if there is no text, use title
    X_title[i] = p.stem_sentence(X_title[i])
    X_text[i] = p.stem_sentence(X_text[i])

list1 = [['id', 'title', 'text', 'subject', 'date', 'label']]
for i in range(0, len(X_text)):
    list1.append([X['id'][i], X_title[i], X_text[i],
                  X['subject'][i], X['date'][i], X['label'][i]])
df1 = pd.DataFrame(list1)
df1.to_csv('stemmed_ISOT_dataset.csv', sep=',', index=False, header=False)