def semEval():
    """Fit a quantifier on the SemEval train split and evaluate it per topic.

    Uses the '100_topics_100_tweets.topic-five-point.subtask-CE' gold files:
    the ``train`` split for fitting and the ``devtest`` split for evaluation.
    Features are the horizontal stack of ``text_processing(n=4)`` and
    ``char_processing()`` outputs. The test set is split by ``test[2]``
    (presumably the topic column -- confirm against ``read_semeval``), the
    'Iter1' predictions are written with ``write_semeval`` and the score of
    every quantification method is printed.

    Disabled experiments in earlier revisions (not executed here): other
    subtask files (subtask-A, subtask-BD), extra sentiment-score columns read
    with ``readSentiScores``, an ``SVMperf`` baseline and an artificial drift
    split via ``Quantification.make_drift_list``.
    """
    fname = '100_topics_100_tweets.topic-five-point.subtask-CE'

    # --- training features -------------------------------------------------
    train = read_semeval('texts/2download/gold/train/' + fname + '.train.gold.tsv')
    word_vectorizer = text_processing(n=4)
    char_vectorizer = char_processing()
    train_word_feats = word_vectorizer.fit_transform(train[0])
    train_char_feats = char_vectorizer.fit_transform(train[0])
    X_train = scipy.sparse.hstack(
        (train_word_feats, train_char_feats), format='csr')
    y_train = np.asarray(train[1])

    # --- test features (vectorizers are only applied, never refitted) ------
    test = read_semeval('texts/2download/gold/devtest/' + fname + '.devtest.gold.tsv')
    test_word_feats = word_vectorizer.transform(test[0])
    test_char_feats = char_vectorizer.transform(test[0])
    X_test = scipy.sparse.hstack(
        (test_word_feats, test_char_feats), format='csr')
    y_test = np.asarray(test[1])
    print(X_test.shape)

    # One (X, y) pair per topic, plus the topic identifiers in the same order.
    X_by_topic, y_by_topic, topics = split_by_topic(X_test, y_test, test[2])

    quantifier = Quantification(method='Iter1', is_clean=True)
    quantifier.fit(X_train, y_train)
    print('train', quantifier._classify_and_count(y_train))

    # Persist the per-topic predictions of the 'Iter1' method.
    predicted = quantifier.predict_set(X_by_topic, method='Iter1')
    write_semeval(dict(zip(topics, predicted)), fname=fname)

    # Report every method on the same per-topic test sets.
    for method_name in ('CC', 'PCC', 'EM', 'EM1', 'Iter', 'Iter1', 'ACC', 'PACC'):
        print(method_name,
              quantifier.score(X_by_topic, y_by_topic, method=method_name))
def after_semEval():
    """Fit a quantifier on the full training gold file and score it on the test gold file.

    Uses '100_topics_XXX_tweets.topic-two-point.subtask-BD' from the
    ``all_train`` directory for fitting and ``test_gold_2.csv`` for
    evaluation. Only the ``text_processing(n=1)`` features are used. The
    test set is split by ``test[2]`` (presumably the topic column -- confirm
    against ``read_semeval``) and the score of every quantification method is
    printed. Unlike ``semEval`` nothing is written to disk.

    Disabled experiments in earlier revisions (not executed here): other
    subtask files, character features via ``char_processing``, extra
    sentiment-score columns read with ``readSentiScores``, an ``SVMperf``
    baseline, an artificial drift split and writing 'Iter1' predictions with
    ``write_semeval``.
    """
    fname = '100_topics_XXX_tweets.topic-two-point.subtask-BD'

    # --- training features -------------------------------------------------
    train = read_semeval('texts/2download/gold/all_train/' + fname + '.all.gold.tsv')
    word_vectorizer = text_processing(n=1)
    X_train = word_vectorizer.fit_transform(train[0])
    y_train = np.asarray(train[1])

    # --- test features (vectorizer is only applied, never refitted) --------
    test = read_semeval('texts/2download/gold/test/test_gold_2.csv')
    X_test = word_vectorizer.transform(test[0])
    y_test = np.asarray(test[1])
    print(X_test.shape)

    # One (X, y) pair per topic; the topic identifiers are not needed here.
    X_by_topic, y_by_topic, utopics = split_by_topic(X_test, y_test, test[2])

    quantifier = Quantification(method='Iter1', is_clean=True)
    quantifier.fit(X_train, y_train)
    print('train', quantifier._classify_and_count(y_train))

    # Report every method on the same per-topic test sets.
    for method_name in ('CC', 'PCC', 'EM', 'EM1', 'Iter', 'Iter1', 'ACC', 'PACC'):
        print(method_name,
              quantifier.score(X_by_topic, y_by_topic, method=method_name))
def __init__(self, token):
    """Configure logging, wire the Telegram handlers and create the helper services.

    Args:
        token: Bot token, passed to both ``Bot`` and ``Updater``.
    """
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO)
    self.logger = logging.getLogger("log")

    self.bot = Bot(token)
    self.updater = Updater(token)
    self.dispatcher = self.updater.dispatcher

    # Slash commands, registered in the same order as before.
    command_callbacks = (
        ('start', self.start),
        ('info', self.info),
        ('feedback', self.feedback),
        ('insert', self.insert),
    )
    for command, callback in command_callbacks:
        self.dispatcher.add_handler(CommandHandler(command, callback))

    # Inline-keyboard button presses.
    self.dispatcher.add_handler(CallbackQueryHandler(self.button_clicked))

    # Plain text and document messages. The document handler used to be
    # registered twice; the second registration was redundant and is dropped.
    self.dispatcher.add_handler(
        MessageHandler(Filters.text, self.text_message))
    self.dispatcher.add_handler(
        MessageHandler(Filters.document, self.document_message))

    self.dispatcher.add_error_handler(self.error)

    self.ner = Ner_Babel()
    self.prediction = Prediction()
    self.kb_interface = KB_interface()
    self.text_processing = text_processing()

    # This dictionary holds nested dictionaries named after the methods that
    # ask the user to press a button; each one carries the variables needed
    # to handle that action. The button's callback_query then identifies
    # that state and which button was pressed.
    self.global_variables = {
        "last_question": "",
        "last_answer": "",
        "main_entity": "",
        "related_entity": "",
        "ambiguos_entities": [],
        "sugests_topics": {
            "buttons": []
        },
        "ask_for_answer_evaluation": {
            "buttons": ["P", "N"]
        },
        "sugests_question": {
            "buttons": []
        }
    }
def endElement(self, tag):
    """Handle a closing XML tag; on ``page`` finalise and queue the document.

    ``revision`` and ``contributor`` only clear their state flags. ``page``
    extracts category / infobox / reference text from the raw page text,
    cleans and processes every text field, records the id -> title mapping
    (using the title as it is before processing), appends the document to
    ``doc_list`` and, once ``doc_chunk_size`` documents are queued, writes
    an inverted index chunk and empties the queue.

    Args:
        tag: Name of the element that just ended.
    """
    global i_count

    if self.doc_running == False:
        return

    # Cheap cases first: these tags only reset a flag.
    if tag == "revision":
        self.revision = False
        return
    if tag == "contributor":
        self.contributor = False
        return
    if tag != "page":
        return

    self.doc_running = False
    page = self.doc

    # These three are extracted from the raw text, before remove_extra runs.
    page.category = ' '.join(
        re.findall(catRegExp, page.text, flags=re.MULTILINE))
    page.infobox = ' '.join(
        re.findall(infoRegExp, page.text, flags=re.DOTALL))
    page.ref = ' '.join(
        re.findall(refRegExp, page.text, flags=re.DOTALL))
    page.text = remove_extra(page.text)

    # The map keeps the title as it is before text_processing is applied.
    docid_title_map[int(page.id)] = page.title

    for field in ("text", "title", "comment", "category", "infobox", "ref"):
        setattr(page, field, text_processing(getattr(page, field)))

    doc_list.append(page)
    if len(doc_list) == doc_chunk_size:
        # Flush a full chunk to its own index file and start a new one.
        i_count += 1
        chunk_file = create_inverted_index(doc_list, i_count, index_folder_path)
        filenames.append(chunk_file)
        doc_list.clear()

    self.doc = ""
from keras.preprocessing import text, sequence
import pickle
from text_processing import text_processing
import os

# Load the raw train / test tables.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Vocabulary size, padded sequence length and embedding dimension.
max_features = 150000
maxlen = 600
embed_size = 300

# Series.fillna returns a new Series instead of modifying in place, so the
# result has to be assigned back; previously it was discarded and missing
# comments stayed NaN.
train["comment_text"] = train["comment_text"].fillna("fillna")
test["comment_text"] = test["comment_text"].fillna("fillna")

text_processer = text_processing()

# Stop-word removal is applied to the whole column at once ...
train["clean_comment"] = text_processer.remove_stopwords(train["comment_text"])
test["clean_comment"] = text_processer.remove_stopwords(test["comment_text"])

# ... while cleaning and GloVe-specific preprocessing run per comment.
train["clean_comment"] = train["clean_comment"].apply(text_processer.clean_text)
test["clean_comment"] = test["clean_comment"].apply(text_processer.clean_text)

train["clean_comment"] = train["clean_comment"].apply(
    text_processer.glove_preprocess)
test["clean_comment"] = test["clean_comment"].apply(
    text_processer.glove_preprocess)

list_sentences_train = train["clean_comment"]
list_sentences_test = test["clean_comment"]

tokenizer = text.Tokenizer(num_words=max_features)
return total total_files = len(os.listdir(testing_data)) count_file = 1 for file in os.listdir(testing_data): print("File " + str(count_file) + " out of " + str(total_files)) count_file += 1 file_path = testing_data + "\\" + file f = open(file_path, 'r') text_body = f.read() token_list = text_processing.text_processing(text_body) best_score = -inf best_class = None for name in list_of_classes: score = class_map_calculation(token_list, name) if score > best_score: best_score = score best_class = name result_dict[file] = best_class with open("saved_result_dict.json", 'w') as f: json.dump(result_dict, f)
from text_processing import text_processing
from paperAlgo2 import algo2
import numpy

# Shared data accessor built from six named resources.
# NOTE(review): what each positional argument means is defined by
# text_processing and is not visible here -- confirm against that module.
t = text_processing("feature1_svd", "feature2", "clustering_data",
                    "attraction_mapping", "attraction_information",
                    "feature2_keywords")
# Number of users; leading dimension of every per-user array below.
M = t.num_users


def algo1(lx, lu, su, D):
    """Build per-user cluster statistics and pairwise membership matrices.

    NOTE(review): the visible part of this function neither uses ``lx``,
    ``lu``, ``su``, ``converged`` and ``allU`` nor returns anything; the
    body presumably continues beyond this excerpt -- confirm before editing.

    Args:
        lx, lu, su: Not used in the visible part (presumably
            hyper-parameters of the later optimisation -- TODO confirm).
        D: Side length of the per-user square matrices in ``allU``.
    """
    converged = False
    # Hard-coded side length of the per-user pairwise matrices B and y.
    # NOTE(review): presumably the number of items -- TODO confirm.
    N = 250
    allU = numpy.zeros((M, D, D))
    # T[m]: (number of within-cluster pairs) / (total clustered items)
    # for user m.
    T = numpy.zeros(M)
    for m in range(0,M):
        actual = 0
        total = 0
        for cluster in t.get_user_clusters(m):
            size = len(cluster)
            # size*(size-1)/2 = number of unordered pairs inside this cluster
            actual = actual + size*(size-1)/2
            total = total+size
        # NOTE(review): divides by zero if user m has no clustered items.
        T[m] = (actual*1.)/(total*1.)
    B = numpy.zeros((M,N,N))
    y = numpy.zeros((M,N,N))
    for m in range(0,M):
        clusters = t.get_user_clusters(m)
        for cluster in clusters:
            for i in cluster:
                for j in range(0,N):
                    # y: +1 when i and j share a cluster for user m, else -1.
                    y[m,i,j] = 1 if j in cluster else -1
                    # B: 1 for same-cluster pairs, T[m] for all other pairs.
                    B[m,i,j] = 1 if j in cluster else T[m]
probability_class_dict[folder] = ( file_total / 16810) # Get total file count through explorer for file in os.listdir(folder_path): print("File ", str(file_count) + " out of " + str(file_total), end="; ") print("Folder ", str(folder_count) + " out of " + str(folder_total)) file_count += 1 file_path = folder_path + "\\" + file f = open(file_path, 'r') text_body = f.read() token_list = text_processing.text_processing( text_body) # Token list for all tokens in a file for token in token_list: if token not in class_dict: class_dict[token] = 1 else: class_dict[token] += 1 end_time = time.time() print("Training time: ", end_time - start_time) with open("saved_master_dict.json", 'w') as f: json.dump(master_dict, f) f.close() with open("saved_probability_class_dict.json", 'w') as f:
field = w d[field] = "" else: try: d[field] += w + ' ' except: d["all"] += w + ' ' return d #queries = f.readlines() while True: print("enter query") query = input() start_time = time() q = check_fields(query) for key in q.keys(): q[key] = text_processing(q[key]) doc_ids = list(query_processing(q)) if (len(doc_ids)) == 0: print("No Results found") else: for i in doc_ids[0:10]: print(doc_title_map[i], end="\r") print() print("time taken: %.2f\n" % (time() - start_time))