def _read_csv(self, pathFilesF, pathFilesM):
    '''Read the CSV samples for the female and male classes and keep only
    texts with more than one token.'''
    textProcessing = TextProcessing()
    samplesInClasses = {}
    samplesInClasses["female"] = []
    samplesInClasses["male"] = []
    samplesF = self._read_files(pathFilesF)
    samplesM = self._read_files(pathFilesM)
    iText = 0
    totalText = len(samplesM) + len(samplesF)
    for sample in samplesF:
        nTokens = len(textProcessing.tokenize_one(sample))
        if nTokens > 1:
            #print("Text " + str(iText) + " / " + str(totalText))
            samplesInClasses["female"].append(sample)
        iText = iText + 1
        #print(metaAttributes.all_meta_attributes)
        #input('-----------------------------')
    for sample in samplesM:
        nTokens = len(textProcessing.tokenize_one(sample))
        if nTokens > 1:
            #print("Text " + str(iText) + " / " + str(totalText))
            samplesInClasses["male"].append(sample)
        iText = iText + 1
    return samplesInClasses

def search_by_type_name(self, pathFiles, targetTypes, targetNames):
    textProcessing = TextProcessing()
    from models.text_analysis.nlp_basics.string_analysis import StringAnalysis
    stringAnalysis = StringAnalysis()
    filesFound = []
    nNames = len(targetNames)
    for root, dirs, files in os.walk(pathFiles):
        # Check whether there are files and whether they are of the target types
        if len(files) > 0:
            for file in files:
                tokens = textProcessing.tokenize_one(file.replace(".", " "))
                if tokens[len(tokens) - 1] in targetTypes:
                    # Check the names
                    vFound = numpy.zeros(nNames)
                    for iName in range(0, nNames):
                        fileName = textProcessing.text_lower_one([tokens[0]])[0]
                        name = textProcessing.text_lower_one([targetNames[iName]])[0]
                        dist = stringAnalysis.string_in_string_dist(fileName, name)
                        if dist == 1:
                            vFound[iName] = 1
                    '''AND operation only'''
                    if numpy.sum(vFound) == nNames:
                        filesFound.append(root + "/" + file)
    #print(len(filesFound))
    #input('------------------')
    return filesFound

def __init__(self, dir=''):
    self.dir = dir
    self.dictionary = Dictionary.load(dir + 'myDictionary')
    self.tp = TextProcessing(dir=dir)
    self.size = 100

def __init__(self, file_in='', file_out='', dir='', n_docs=-1):
    self.file_in = file_in
    self.file_out = file_out
    self.n_docs = n_docs
    self.tp = TextProcessing(dir=dir)
    self.process_corpus()

def __init__(self, corpus_file, n_docs=-1):
    self.corpus_file = corpus_file
    self.n_docs = n_docs
    self.tp = TextProcessing(dir='')
    self.dictionary = Dictionary('')
    self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    self.en_stop = get_stop_words('en')
    self.p_stemmer = PorterStemmer()

def search_by_type(self, pathFiles, targetTypes):
    textProcessing = TextProcessing()
    filesFound = []
    for root, dirs, files in os.walk(pathFiles):
        # Check whether there are files and whether they are of the target types
        if len(files) > 0:
            for file in files:
                tokens = textProcessing.tokenize_one(file.replace(".", " "))
                if tokens[len(tokens) - 1] in targetTypes:
                    filesFound.append(root + "/" + file)
    return filesFound
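
# Usage sketch (hedged: FileSearch is the assumed owning class, inferred from the
# _read_files example further below; the path is a placeholder):
#   fileSearch = FileSearch()
#   csv_files = fileSearch.search_by_type("/path/to/data", ["csv"])
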
def __init__(self, dir='', load_dict=False):
    self.dir = dir
    self.tp = TextProcessing(dir=self.dir)
    # create empty dictionary:
    #self.dictionary = Dictionary()
    self.dictionary = Dictionary.load(dir + 'myDictionary')
    self.save_dict = True
    self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    self.en_stop = get_stop_words('en')
    self.p_stemmer = PorterStemmer()

def create_training_data():
    data_lst = pickle.load(open('data/harvest.data', 'rb'))
    feature_process.feature_map['source'] = {'Google': 1, 'Twitter for iPad': 2, 'Echofon': 3,
                                             'Bitly': 4, 'twitterfeed': 5, 'Twitter for iPhone': 6,
                                             'Foursquare': 7, 'Facebook': 8, 'Twitter for Android': 9,
                                             'TweetDeck': 10, 'Twitter Web Client': 11}
    feature_process.feature_map['geo'] = ['None']
    feature_process.feature_map['place'] = ['None']
    feature_process.feature_map['verified'] = ['False']
    feature_process.feature_map['geo_enabled'] = ['False']
    y = []
    x = []
    for i in range(0, len(data_lst)):
        try:
            # is_not_important is expected to be defined elsewhere; default label is 1
            label = is_not_important[i]
        except Exception as e:
            label = 1
        data = data_lst[i]
        text = TextProcessing.process(data[0])
        source = FeatureMapping.mapping('source', data[1])
        re_tweet = data[2]
        geo = FeatureMapping.mapping_other('geo', data[3])
        place = FeatureMapping.mapping_other('place', data[4])
        hash_tag = data[5]
        media = data[6]
        verified = FeatureMapping.mapping_other('verified', data[7])
        follower = data[8]
        statues = data[9]
        desc = TextProcessing.process(data[10])
        friend = data[11]
        location = TextProcessing.process(data[12])
        geo_enabled = FeatureMapping.mapping_other('geo_enabled', data[13])
        y.append(label)
        x.append([text, source, re_tweet, geo, place, hash_tag, media, verified,
                  follower, statues, desc, friend, location, geo_enabled])

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, accuracy_score
    clf = RandomForestClassifier()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    fsc = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print('f1-score : ', fsc)
    print('accuracy : ', acc)
    print(y_pred)
    print(y_test)

class PreprocessCorpusFile:

    def __init__(self, file_in='', file_out='', dir='', n_docs=-1):
        self.file_in = file_in
        self.file_out = file_out
        self.n_docs = n_docs
        self.tp = TextProcessing(dir=dir)
        self.process_corpus()

    def process_corpus(self):
        fin = open(self.file_in, 'r')
        fout = open(self.file_out, 'w')
        i = 0
        for line in fin.readlines():
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)
            # write to file
            fout.write(' '.join(stemmed_tokens))
            fout.write('\n')
            i += 1
            if self.n_docs != -1 and i >= self.n_docs:
                break
            if i % 1000 == 0:
                logging.debug('Sentence %s processed' % i)
        fin.close()
        fout.close()
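
# Usage sketch (file and directory names are placeholders, not from the original code):
#   PreprocessCorpusFile(file_in='corpus_raw.txt', file_out='corpus_clean.txt',
#                        dir='models/', n_docs=10000)
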
class Sentences:

    def __init__(self, corpus_file, n_docs=-1):
        self.corpus_file = corpus_file
        self.n_docs = n_docs
        self.tp = TextProcessing(dir='')
        self.dictionary = Dictionary('')
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()

    def __iter__(self, dict_dir):
        logging.info("Loading corpus in file %s" % self.corpus_file)
        i = 0
        for line in open(self.corpus_file, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)
            # add tokens to list
            #ret.append(stemmed_tokens)
            # add line to dictionary
            d2 = Dictionary(stemmed_tokens)
            self.dictionary = self.dictionary.merge_with(d2)
            # count number of documents and break if > num_docs
            i += 1
            if self.n_docs != -1 and i >= self.n_docs:
                break
            if i % 1000 == 0:
                logging.debug('Document %s loaded' % i)

def create_training_data():
    data_lst = pickle.load(open('data/harvest.data', 'rb'))
    feature_process.feature_map['source'] = {'Google': 1, 'Twitter for iPad': 2, 'Echofon': 3,
                                             'Bitly': 4, 'twitterfeed': 5, 'Twitter for iPhone': 6,
                                             'Foursquare': 7, 'Facebook': 8, 'Twitter for Android': 9,
                                             'TweetDeck': 10, 'Twitter Web Client': 11}
    feature_process.feature_map['geo'] = ['None']
    feature_process.feature_map['place'] = ['None']
    feature_process.feature_map['verified'] = ['False']
    feature_process.feature_map['geo_enabled'] = ['False']
    y = []
    x = []
    for i in range(0, len(data_lst)):
        try:
            # is_not_important is expected to be defined elsewhere; default label is 1
            label = is_not_important[i]
        except Exception as e:
            label = 1
        data = data_lst[i]
        text = TextProcessing.process(data[0])
        source = FeatureMapping.mapping('source', data[1])
        re_tweet = data[2]
        geo = FeatureMapping.mapping_other('geo', data[3])
        place = FeatureMapping.mapping_other('place', data[4])
        hash_tag = data[5]
        media = data[6]
        verified = FeatureMapping.mapping_other('verified', data[7])
        follower = data[8]
        statues = data[9]
        desc = TextProcessing.process(data[10])
        friend = data[11]
        location = TextProcessing.process(data[12])
        geo_enabled = FeatureMapping.mapping_other('geo_enabled', data[13])
        y.append(label)
        x.append([text, source, re_tweet, geo, place, hash_tag, media, verified,
                  follower, statues, desc, friend, location, geo_enabled])

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, accuracy_score
    clf = RandomForestClassifier()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    fsc = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(fsc, acc)
    print(y_pred)
    print(y_test)

def _read_files(self, pathFiles):
    import sys
    textProcessing = TextProcessing()
    fileSearch = FileSearch()
    files = fileSearch.search_by_type(pathFiles, "csv")
    samples = []
    csv.field_size_limit(sys.maxsize)
    for file in files:
        csvfile = open(file, newline='')
        print(file)
        csvreader = csv.reader(csvfile, delimiter=';', quotechar='|')
        #print(csvreader)
        for row in csvreader:
            nTokens = len(textProcessing.tokenize_one(' '.join(row)))
            if nTokens < 50:
                #print(row)
                samples.append(' '.join(row))
    return samples

import os
import heapq
import shelve
from collections import Counter
from typing import Union, List, Tuple

from tqdm import tqdm

from utils import timer, load_wapo
from text_processing import TextProcessing

text_processor = TextProcessing.from_nltk()  # include your customized text processing class


@timer
def build_inverted_index(
    wapo_jl_path: Union[str, os.PathLike], index_shelve_path: str
) -> None:
    """
    Load wapo_pa3.jl to build the inverted index and store the index as a shelf in the provided path.
    :param wapo_jl_path:
    :param index_shelve_path:
    :return:
    """
    # Note: generating the inverted index first and then assigning it to the shelf gives a
    # big speed improvement, but doing so ignores the whole point of using a shelf for the index.
    # The current iteration takes about 15-25 minutes to run.
    with shelve.open(index_shelve_path, flag='n', writeback=True) as index:
        index["___count"] = Counter()  # this is used for analysis in custom processing
        for doc in load_wapo(wapo_jl_path):
            normal_tokens, stops = text_processor.get_normalized_tokens(doc['title'], doc['content_str'])
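            # Possible continuation (a sketch, not the original code): accumulate token
            # counts and a postings list per token. The doc["id"] field and the list-based
            # postings layout are assumptions here.
            index["___count"].update(normal_tokens)
            for token in normal_tokens:
                postings = index.get(token, [])
                postings.append(doc["id"])
                index[token] = postings
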
def __init__(self, text):
    import time
    start = time.clock()
    '''
    -------------------------------------------------------------------------
    DEFINITION OF THE CONTROL PARAMETERS
    -------------------------------------------------------------------------
    '''
    tp = TextProcessing()
    self.nMaxLengthFreq = 16  # NOTE 1: maximum word length considered in the word-length frequency
    savePath = "/home/ahirton/Python/gender_classification/outputfiles/"
    #savePath = "/home/rpasti/workspace/gender_classification/outputfiles/"
    tagged = tp.tagging([tp.tokenize([text])[0]], savePath, "en")[0]
    fileUtils = FileUtils(savePath)
    text = re.sub("http", "", text)
    self.raw = text
    # print tagged
    self.PARAGRAPHS = []
    self.SENTENCES = []
    self.WORDS = []
    delimiters = '\n', '. \n', '! \n', '?\n', '.\n', '!\n', '?\n', '... \n'  #, '... \n'#, ' \n '#, " .\n", " !\n", ' ?\n'
    regexPattern = '|'.join(map(re.escape, delimiters))
    for paragraph in re.split(regexPattern, self.raw):
        p = []
        # print ""
        # print paragraph
        # raw_input(".----------------.END OF PARAGRAPH----------------.")
        #sentences = tp.tokenize_sentence([paragraph])[0]
        for sentence in tp.tokenize_sentence([paragraph])[0]:
            # print ""
            # print sentence
            # print tp.tagging(tp.tokenize([sentence]))
            # raw_input(".---------------..END OF SENTENCE...------.")
            words = tp.tokenize([sentence])[0]
            #words = tp.remove_punctuation([words])[0]
            self.WORDS.extend(words)
            self.SENTENCES.append(sentence)
            p.append(words)
            # print paragraph
            # print sentence
            # print words
            # print self.WORDS
            # raw_input('XXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
        self.PARAGRAPHS.append(p)
    self.C = len(text)
    self.LOWER = MetaAttributes._count_char(text, r"^[a-z_-]*$")
    self.UPPER = MetaAttributes._count_char(text, r"^[A-Z_-]*$")
    self.NUMBERS = MetaAttributes._count_char(text, r"^[\d]*$")
    self.WHITE = MetaAttributes._count_char(text, r"^[ ]*$")
    self.TAB = MetaAttributes._count_char(text, r"^[\t]*$")
    self.N = len(self.WORDS)
    self.SIZES = []
    self.FREQ = {}
    for w in self.WORDS:
        self.SIZES.append(len(w))
    self.FREQ = dict(nltk.FreqDist(self.WORDS))
    self.V = dict(nltk.FreqDist(self.FREQ.values()))
    self.VRICH = self.N - len(self.V)
    self.HXLEGO = []
    self.HXDISLEGO = []
    for w, t in self.FREQ.items():
        if t == 1:
            self.HXLEGO.append(w)
        elif t == 2:
            self.HXDISLEGO.append(w)
    self.TAGGED = tagged
    self.S = len(self.SENTENCES)
    self.pwdictionary = semantic_dictionaries.extended_positive()
    self.nwdictionary = semantic_dictionaries.extended_negative()
    self.neutralwdictionary = semantic_dictionaries.extended_neutral_words()
    self.LIWCdict = fileUtils.load_object("liwc", "dict")

from read_data import ReadData
from text_processing import TextProcessing

st.set_page_config(layout="wide")
st.markdown("<h1 style='text-align: center; color: black;'>Multipurpose Natural Language Processing App</h1>",
            unsafe_allow_html=True)
st.markdown(Config.hide_streamlit_style, unsafe_allow_html=True)

data_choice = st.radio("Select your preferred way of data input",
                       ('Upload a file', 'Direct text input'))

if data_choice == 'Upload a file':
    uploaded_file = st.sidebar.file_uploader("Upload your file:", type=['txt'])
    read_obj = ReadData(uploaded_file)
    data = read_obj.read_file_txt()
    input_type = True
else:
    data = st.text_input('Input your text here:')
    input_type = False

if data is not None:
    model_option = st.selectbox("Please choose your intended model:", ["Text Summarization"])
    process_obj = TextProcessing(data)
    cleaned_data = process_obj.text_cleaning(input_type)

class MyWord2Vec:

    def __init__(self, dir=''):
        self.dir = dir
        self.dictionary = Dictionary.load(dir + 'myDictionary')
        self.tp = TextProcessing(dir=dir)
        self.size = 100

    def load_corpus(self, file_name, num_docs):
        texts = []
        i = 0
        for line in open(file_name, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)
            # add tokens to list
            texts.append(stemmed_tokens)
            # count number of documents and break if > num_docs
            i += 1
            if num_docs != -1 and i >= num_docs:
                break
        return texts

    def train_model(self, file_name='corpus.txt', num_docs=-1, size=100):
        self.size = size
        # generate corpus
        #corpus = self.load_corpus(file_name, num_docs)
        corpus = LineSentence(file_name, limit=num_docs)
        # generate Word2Vec model
        model = Word2Vec(corpus, size=size, window=5, min_count=10, workers=3)
        return model

    def update_model(self, model, file_name, num_docs=-1):
        # generate new corpus
        corpus = self.load_corpus(file_name, num_docs)
        # update Word2Vec model
        model.update(corpus)

    def get_word_embedding(self, model, word):
        if word in model.wv.vocab:
            vec = model.wv[word]
        else:
            w_clean = self.tp.clean_word(word)
            if w_clean in model.wv.vocab:
                vec = model.wv[w_clean]
            else:
                vec = np.zeros(self.size)
        return vec

    def get_sentence_embedding(self, model, line):
        words = self.tp.clean_line(line)
        vec = np.zeros(self.size)
        n_words = 0
        for w in words:
            if w in model.wv:
                vec += model.wv[w]
                n_words += 1
        if n_words > 0:
            return vec / n_words
        else:
            return vec

    def save_model(self, model):
        model.save(self.dir + 'myW2Vmodel')
        #self.dictionary.save('myDictionary')

    def load_model(self):
        model = Word2Vec.load(self.dir + 'myW2Vmodel')
        return model
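
# Usage sketch (hedged: assumes a 'myDictionary' file and a corpus file already exist;
# directory and file names are placeholders):
#   w2v = MyWord2Vec(dir='models/')
#   model = w2v.train_model(file_name='corpus.txt', size=100)
#   w2v.save_model(model)
#   vec = w2v.get_sentence_embedding(model, "an example sentence")
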
def main():
    input_directory = sys.argv[1]
    train_size = int(sys.argv[2])
    test_size = (100 - train_size) / 100

    ##### Step 1: Data Loading and Basic stats #####
    t0 = time()
    print()
    print('** STEP 1: Data Loading **')
    dl_obj = DataLoading()
    base_df = dl_obj.clean_data(input_directory)
    #prodid_ix = base_df.id.values
    #base_df = base_df.reindex(prodid_ix)
    ## This line should be removed ##
    #print('Only 1000 rows are loaded')
    #base_df = base_df.sample(10000, random_state=123)
    target_matrix = dl_obj.get_multilabel(base_df)
    #target_matrix = target_matrix.reindex(prodid_ix)
    dl_obj.get_label_info(target_matrix)

    #### Step 2: Feature Engineering #####
    print()
    print('** STEP 2: Text Processing **')
    tp_obj = TextProcessing()
    cnt_vectorizer, feature_matrix = tp_obj.run_textprocessing(base_df)
    feature_matrix = pd.DataFrame(feature_matrix.toarray())
    feature_matrix = feature_matrix.join(
        base_df[['vegetarian', 'spicy', 'garlic', 'fish']])
    feature_matrix.fillna(0, inplace=True)

    #### Step 3 ####
    ### STEP 1: Normalize the labels ###
    print()
    print('** Filter Rare Labels combination **')
    util = Utility()
    print("Feature Matrix Shape:{} Target Matrix.shape: {}"
          .format(feature_matrix.shape, target_matrix.shape))
    feature_matrix_fltrd, target_matrix_fltrd = util.filter_rare_classes(
        feature_matrix, target_matrix)
    print("Feature Matrix Shape:{} Target Matrix.shape: {}"
          .format(feature_matrix_fltrd.shape, target_matrix_fltrd.shape))  # (18340, 3763)

    ### STEP 2: Train Test Split using StratifiedShuffleSplit #####
    print()
    print('** Train test split **')
    train_x, train_y, test_x, test_y = util.train_test_split(
        feature_matrix_fltrd, target_matrix_fltrd, test_size=test_size)
    print("Train_x Shape:{} \n Train_y.shape: {}"
          .format(train_x.shape, train_y.shape))  # 14672
    print("Test_x Shape:{} \n Test_y.shape: {}"
          .format(test_x.shape, test_y.shape))  # 3668

    ### Delete unnecessary files from memory ##
    ### STEP 3: Find Frequent Itemsets on training target matrix ####
    print()
    print('** STEP 3: Frequent Itemset **')
    col_mapping = {}
    for i_col, col_name in enumerate(target_matrix.columns.tolist()):
        col_mapping[i_col] = col_name
    supp = 0.05
    item_size = 3
    train_y_lil = lil_matrix(train_y)
    frequent_items_list = util.find_frequent_itemsets(train_y_lil, col_mapping, supp, item_size)
    print('No of {} frequent itemsets with support {}: {} '
          .format(item_size, supp, len(frequent_items_list)))  # 21 itemsets
    freq_additives_list = [
        items for itemset in frequent_items_list for items in itemset
    ]
    freq_additives_set = list(
        set([items for itemset in frequent_items_list for items in itemset]))
    freq_additives_cnt_dict = dict(Counter(freq_additives_list).items())
    #del base_df, target_matrix, target_matrix_fltrd, feature_matrix, feature_matrix_fltrd
    #gc.collect()

    ### STEP 4.1: Build 21 classifiers using Naive Bayes ####
    print()
    print('** STEP 4: LabelPowerSet Classifiers **')
    lp = LabelPowerSet(train_x, train_y, test_x, test_y, frequent_items_list, 'nb')
    model_list, metrics_labels, metrics_score, prediction_list = lp.build_model_lp()
    index_value = [''.join(items) for items in frequent_items_list]
    metrics_labels_df = pd.DataFrame(
        metrics_labels,
        columns=['Accuracy', 'HammingLoss', 'JaccardSim'],
        index=index_value)
    metrics_score_df = pd.DataFrame(
        metrics_score,
        columns=['CoverageError', 'LblRankAvgPrec', 'LblRankLoss', 'LogLoss'],
        index=index_value)
    pickle.dump(model_list, open('LP_NB_21FSS.pkl', 'wb'))
    del model_list, lp
    metrics_labels_df.to_csv(input_directory + 'LP_NB_metrics_labels.csv')
    metrics_score_df.to_csv(input_directory + 'LP_NB_metrics_score.csv')

    ####### STEP 4.2: Stack the predictions ############
    final_predictions = pd.DataFrame(np.zeros(test_y[freq_additives_set].shape),
                                     columns=freq_additives_set)
    for i_model in range(len(prediction_list)):
        #i_model = 0
        prediction = prediction_list[i_model]
        for col in prediction.columns:
            final_predictions[col] = final_predictions[col] + prediction[col]
    final_predictions_2 = final_predictions.apply(
        lambda x: x / freq_additives_cnt_dict[x.name])
    final_predictions_2 = final_predictions_2.applymap(lambda x: 1 if x >= 0.5 else 0)

    print()
    print('** Evaluation metrics : Majority Voting **')
    eval_metrics = EvaluationMetrics()
    eval_final = eval_metrics.get_classification_report_1(
        test_y[freq_additives_set], final_predictions_2, verbose=1)

    #### STEP 5: Build Binary Relevance models ####
    print()
    print('** STEP 5 : Binary Relevance Classifiers **')
    br = BinaryRelevance()
    label_df, score_df, classifier_list = br.build_model(train_x, train_y, test_x, test_y)
    pickle.dump(classifier_list, open('BR_NB_classifiersList.pickle', 'wb'))
    print()
    print('** Evaluation Metrics for BR Classifiers **')
    eval_metrics.get_classification_report_1(test_y[label_df.columns], label_df)
    # Accuracy: 0.42, Hamming Loss: 0.05, Jaccard Similarity: 0.62
    eval_metrics.get_classification_report_2(test_y[label_df.columns], score_df)
    # CoverageError: 5.61, LabelRankingAvgPrec: 0.83, LabelRankingLoss: 0.04, Log_loss: 6.7

    ######## Binary Relevance predictions for frequent labels #####
    print()
    print('** BR classifiers evaluation for labels in frequent itemset **')
    eval_metrics.get_classification_report_1(test_y[freq_additives_set],
                                             label_df[freq_additives_set])

    ### STEP 6: Final Predictions #########
    print()
    print('** STEP 6 : Final Predictions **')
    final_predictions_3 = pd.DataFrame(np.zeros(label_df.shape), columns=label_df.columns)
    ### Binary Relevance + LabelPowerset #####
    for col in final_predictions_3.columns:
        if col in freq_additives_set:
            final_predictions_3[col] = final_predictions_2[col]
        else:
            final_predictions_3[col] = label_df[col]
    print()
    print('** Evaluation Metrics for Final Prediction **')
    print('test_ shape', test_y[label_df.columns].shape)
    print('final predictions', final_predictions_3.shape)
    eval_final_2 = eval_metrics.get_classification_report_1(
        test_y[label_df.columns], final_predictions_3, verbose=1)

    ### STEP 7: Dumping Predictions ##########
    print()
    print('** STEP 7 : Saving Predictions **')
    test_y.to_csv('test_actual_labels.csv')
    final_predictions_3.to_csv('test_final_predicted_labels.csv')
    score_df.to_csv('test_scoring_from_br.csv')
    print('Entire Process completed in {} seconds'.format(time() - t0))
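
# Entry-point sketch (an assumption, not part of the original snippet):
if __name__ == '__main__':
    main()  # e.g. python <script>.py <input_directory> <train_size_percent>
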
from tqdm import tqdm
import random

from textblob import TextBlob
import numpy as np
from sklearn.linear_model import LogisticRegression

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
import torch
from keras import backend as K

print(K.tensorflow_backend._get_available_gpus())

logger = log.setup_custom_logger('analysis')

percent_of_data = .5  # fraction of the lines to sample
sentiment_threshold = 0
text_process = TextProcessing()
tqdm.pandas()


def get_data(filename):
    # keep the header, take random rows
    logger.info("Reading {} percent of the data.".format(percent_of_data * 100))
    data = pd.read_csv(
        filename,
        header=0,
        skiprows=lambda index: index > 0 and random.random() > percent_of_data)
    logger.info('Data Size Read : {}'.format(len(data)))
    logger.info('Dropping NAs')
    data = data.dropna()

class LDA:

    def __init__(self, dir='', load_dict=False):
        self.dir = dir
        self.tp = TextProcessing(dir=self.dir)
        # create empty dictionary:
        #self.dictionary = Dictionary()
        self.dictionary = Dictionary.load(dir + 'myDictionary')
        self.save_dict = True
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()

    def clean_line(self, line):
        raw = line.lower()
        tokens = self.tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in self.en_stop]
        # stem tokens
        r = []
        for i in stopped_tokens:
            try:
                r.append(self.clean_word(i))
            except:
                logging.info("Can't process word %s" % i)
        return r

    def clean_word(self, word):
        stemmed_word = self.p_stemmer.stem(word)
        return stemmed_word

    def load_corpus(self, file_name, num_docs):
        logging.info("Loading corpus in file %s" % file_name)
        texts = []
        i = 0
        for line in open(file_name, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)
            # add tokens to list
            texts.append(stemmed_tokens)
            # count number of documents and break if > num_docs
            i += 1
            if num_docs != -1 and i >= num_docs:
                break
            if i % 1000 == 0:
                logging.debug('Document %s loaded' % i)
        # turn our tokenized documents into an id <-> term dictionary
        #if len(self.dictionary) == 0:
        #self.dictionary = Dictionary(texts)
        #self.dictionary.save(self.dir + 'myDictionary')
        '''else:
            # self.dictionary.merge_with(Dictionary(texts))
            pass'''
        # convert tokenized documents into a document-term matrix
        return [self.dictionary.doc2bow(text) for text in texts]

    def train_model(self, file_name='corpus.txt', num_docs=-1, num_topics=50, passes=20, multicore=False):
        # generate LDA model
        if not multicore:
            corpus = self.load_corpus(file_name, num_docs)
            ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=self.dictionary, passes=passes)
        else:
            corpus = Sentences(file_name, num_docs)
            ldamodel = LdaMulticore(corpus.__iter__(), num_topics=num_topics, id2word=self.dictionary,
                                    passes=passes, workers=3)
        return ldamodel

    def update_model(self, ldamodel, file_name, num_docs=-1):
        # generate new corpus
        corpus = self.load_corpus(file_name, num_docs)
        # update LDA model
        ldamodel.update(corpus)

    def get_document_topics(self, ldamodel, text, n=1):
        text = self.tp.clean_line(text)
        bow = self.dictionary.doc2bow(text)
        if n == 1:
            return ldamodel.get_document_topics(bow, minimum_probability=0)
        list_d = []
        keys = set()
        for _ in range(n):
            d = dict(ldamodel.get_document_topics(bow))
            list_d.append(d)
            for k in d.keys():
                keys.add(k)
        probs = []
        for k in keys:
            mean = 0
            for i in range(n):
                if k in list_d[i].keys():
                    mean += list_d[i][k]
            probs.append((k, mean / n))
        return probs

    def show_topic_words(self, ldamodel, topic_id, topn=10):
        list = ldamodel.get_topic_terms(topic_id, topn=topn)
        r = []
        for w_id, p in list:
            print(self.dictionary[w_id], ' \t ', p)
            r.append((self.dictionary[w_id], p))
        return r

    def save_model(self, ldamodel):
        ldamodel.save(self.dir + 'myLDAmodel')

    def load_model(self):
        return LdaModel.load(self.dir + 'myLDAmodel')
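
# Usage sketch (hedged: assumes 'myDictionary' and 'corpus.txt' already exist in dir;
# names are placeholders):
#   lda = LDA(dir='models/')
#   model = lda.train_model(file_name='corpus.txt', num_topics=50, passes=20)
#   lda.save_model(model)
#   topics = lda.get_document_topics(model, "some new document text")
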
from nltk.stem.porter import PorterStemmer  # type: ignore
from nltk.stem import SnowballStemmer
from text_processing import TextProcessing
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords  # type: ignore
from pathlib import Path

"""
I made this simple script to test the effect of three popular stemming
algorithms on the number of tokens returned. In increasing aggressiveness:
- Porter
- Snowball
- Lancaster
"""

snow = TextProcessing(stemmer=SnowballStemmer('english').stem,
                      stop_words=stopwords.words("english"))
port = TextProcessing.from_nltk()
lan = TextProcessing(stemmer=LancasterStemmer().stem,
                     stop_words=stopwords.words("english"))

from utils import load_wapo

data_dir = Path("pa3_data")
wapo_path = data_dir.joinpath("wapo_pa3.jl")

ss = set()
ps = set()
ls = set()
for doc in list(load_wapo(wapo_path))[:200]:
    ss = ss.union(snow.get_normalized_tokens("", doc['content_str'])[0])
    ps = ps.union(port.get_normalized_tokens("", doc['content_str'])[0])
    ls = ls.union(lan.get_normalized_tokens("", doc['content_str'])[0])
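
# Follow-up sketch (not in the original script): compare the resulting vocabulary sizes.
print(f"snowball: {len(ss)}  porter: {len(ps)}  lancaster: {len(ls)}")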