def kfcvkNN(mi, k=10):
    """Run k-fold cross-validation of the TFIDF kNN classifier and print accuracy.

    For each fold ``i`` in ``range(k)`` the fold is selected through
    ``MessageFeatures.test_fold`` / ``MessageFeatures.folds``, a fresh
    ``TFIDF(mi, 3)`` is trained with ``train1()``, and every message for which
    ``m.isTest(mi.num_msgs)`` is true is classified with ``get_class_kNN``.
    ``tf.correct`` is printed after each fold; the overall fraction of test
    messages whose predicted class equals ``m.newsgroupnum`` is printed last.

    Args:
        mi: iterable of messages that also exposes ``num_msgs``.
        k: number of folds.
    """
    tot = 0
    cor = 0
    for i in range(k):
        MessageFeatures.test_fold = i
        MessageFeatures.folds = k
        tf = TFIDF(mi, 3)
        tf.train1()
        tf.correct = 0
        c = 0
        t = 0
        for m in mi:
            if m.isTest(mi.num_msgs):
                cl = tf.get_class_kNN(m)
                if cl == m.newsgroupnum:
                    c += 1
                t += 1
        print(tf.correct)
        tot += t
        cor += c
    # With no test messages at all the ratio is undefined; report 0.0 rather
    # than raising ZeroDivisionError.
    print(1.0 * cor / tot if tot else 0.0)
def pl_preprocessing(total_pl, num_pl=8, d_word=300):
    """Turn every item of every field into a feature vector with a class label.

    The label of an item is the index of the field it belongs to. Items whose
    vector returned by ``get_pl_v`` does not have exactly ``num_pl * d_word``
    entries are skipped.

    Args:
        total_pl: iterable of fields, each an iterable of items.
        num_pl: value forwarded to ``get_pl_v`` (default 8, the former
            hard-coded ``NUM_PL``).
        d_word: value forwarded to ``get_pl_v`` (default 300, the former
            hard-coded ``D_WORD``).

    Returns:
        ``(train_data, train_y)``: the accepted vectors and their labels.
    """
    tf_idf = TFIDF(total_pl)
    pl_cnt, _words = tf_idf.get_tfidf()
    # Previously the literal 2400 (= 8 * 300); derived so it tracks the sizes.
    expected_len = num_pl * d_word
    train_data = []
    train_y = []
    for label, field in enumerate(total_pl):
        for item in field:
            vec = get_pl_v(item, pl_cnt, num_pl, d_word)
            if len(vec) == expected_len:
                train_data.append(vec)
                train_y.append(label)
    return train_data, train_y
def testBaseFC(seedUrls, pLimit):
    """Build a TFIDF model from the seed documents and run ``baseFC``.

    Prints a ``Content-Type`` header and the seed URL list, then passes the
    model and a crawl-options dict to ``baseFC``.

    Args:
        seedUrls: not used by this body (see the review note below).
        pLimit: value stored in the options as ``num_pages``.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Content-Type: text/plain\n\n')
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    # NOTE(review): `seedFile` is not defined in this function and `seedUrls`
    # is never read; presumably `seedFile` is a module-level name - confirm,
    # otherwise this raises NameError.
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    pos = cleandocs
    mytfidf.buildModel(pos)
    baseFC(mytfidf, options)
def get_keywords(self, pageText, count):
    """Return the words of ``pageText`` picked by ``selectImportantWords_tf``.

    Args:
        pageText: text of one page; tokenized with ``getTokenizedDocs``.
        count: passed to ``TFIDF.selectImportantWords_tf``.

    Returns:
        list of vocabulary words, one per selected entry. Element ``[1]`` of
        each selected entry is used as a position in the vocabulary key list.
    """
    mytfidf = TFIDF()
    tokenPageText = getTokenizedDocs([pageText])
    token_bow = [mytfidf.doc2bow(doc) for doc in tokenPageText]
    mytfidf.buildVocabIndex(token_bow)
    selected = mytfidf.selectImportantWords_tf(count)
    # dict.keys() is a non-subscriptable view on Python 3; materialise it so
    # positional lookup works on both Python 2 and 3.
    wordsList = list(mytfidf.index.keys())
    return [wordsList[entry[1]] for entry in selected]
def __init__(self, file_name):
    """Set up a search engine backed by PageRank and TF-IDF.

    Args:
        file_name: path to xml files of wiki dump
    """
    # Corpus and link structure both come from the xml dump.
    corpus, links = build_corpus(file_name)
    self.corpus = corpus
    self.links = links

    self.tf_idf = TFIDF(self.corpus)
    print("TFIDF engine has started")

    # word -> set of keys of that word's tf-idf mapping
    reverse_index = {}
    for word, mapping in self.tf_idf.tf_idf.items():
        reverse_index[word] = set(mapping.keys())
    self.reverse_index = reverse_index

    self.page_rank = PageRank(self.links, self.tf_idf.tf_idf)
    print("PageRank engine has started")
def ask_question(qs_input, top_k):
    """Ask one question and print the LM, TFIDF and CNN responses.

    The TFIDF model is asked for ``top_k * 10`` candidates, whose ids are
    handed to the CNN model. At most ``top_k`` responses are printed per
    model; fewer are printed when a model returns fewer.

    Args:
        qs_input: the question text.
        top_k: maximum number of responses to print per model.
    """
    # time.clock() was removed in Python 3.8; use perf_counter when it exists
    # and fall back to clock on interpreters that predate it.
    timer = getattr(time, "perf_counter", None) or time.clock

    print("Question : %s" % qs_input)
    print("Top k : : %d" % top_k)
    random.seed(12345)

    retrieval_data_start_time = timer()
    questions, pred_questions, answers, pred_answers = Data.read_pred_data(
        "Data/pred_QA-pair.csv")
    # Build word --> sentence dictionary
    word_sentence_dict = Data.generate_word_sentence_dict(pred_questions)
    print("Retrieval Data Finished")
    retrieval_data_end_time = timer()
    print("Retrieval Data cost %f" %
          (retrieval_data_end_time - retrieval_data_start_time))

    response_start_time = timer()
    lm = LM(questions, pred_questions, answers, pred_answers,
            word_sentence_dict)
    tfidf = TFIDF(questions, pred_questions, answers, pred_answers,
                  word_sentence_dict)
    cnn = CNN(questions, pred_questions, answers, pred_answers,
              word_sentence_dict, isTrain=False)
    _, lm_response = lm.ask_response(qs_input, top_k=top_k)
    tfidf_response_id, tfidf_response = tfidf.ask_response(
        qs_input, top_k=top_k * 10)
    cnn_response = cnn.ask_response(qs_input, top_k, tfidf_response_id)

    for label, responses in (("LM", lm_response),
                             ("TFIDF", tfidf_response),
                             ("CNN", cnn_response)):
        # Bound by the actual length so a short result list cannot raise
        # IndexError.
        for i in range(min(top_k, len(responses))):
            print("%s response %d: %s" % (label, i + 1, responses[i]))

    print("Response Finished")
    response_end_time = timer()
    print("Response cost %f" % (response_end_time - response_start_time))
def baseFC_OneTargetVector(crawlParams):
    """Run a crawl scored by a single TFIDF model and return the relevant pages.

    Reads ``seedURLs``, ``model`` and ``No_Keywords`` from ``crawlParams`` and
    stores the created ``priorityQueue`` and ``scorer`` back into it before
    handing the dict to ``Crawler``.

    Args:
        crawlParams: dict of crawl settings; modified in place.

    Returns:
        ``crawler.relevantPages`` after ``crawl()`` has run.
    """
    # Each seed enters the queue as a (-1, url, -1, "") tuple.
    seed_entries = [(-1, url, -1, "") for url in crawlParams['seedURLs']]
    crawlParams["priorityQueue"] = PriorityQueue(seed_entries)

    scorer = TFIDF()
    scorer.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    crawlParams['scorer'] = scorer

    crawler = Crawler(crawlParams)
    crawler.crawl()
    return crawler.relevantPages
def __init__(self, modelInstance):
    """Store the model and load the stacked train / test features and labels.

    Each feature object is read twice: ``read()`` for the training features
    and ``read('competition_test')`` for the test features. The per-feature
    results are joined with ``np.hstack``. Labels come from ``DataSet``.

    Args:
        modelInstance: stored as ``self.model``.
    """
    self.model = modelInstance

    extractors = [
        cosine_similarity.CosineSimilarity(),
        n_gram_matching.NGramMatching(),
        sentiment_feature.SentimentFeature(),
        SVD.SVD(),
        TFIDF.TFIDF(),
        baseline_features.BaselineFeature(),
        cue_words.CueWords(),
    ]

    train_parts = []
    for extractor in extractors:
        train_parts.append(extractor.read())
    self.features_train = np.hstack(train_parts)
    train_set = DataSet(path="../FNC-1")
    self.labels_train = train_set.get_labels()

    test_parts = []
    for extractor in extractors:
        test_parts.append(extractor.read('competition_test'))
    self.features_test = np.hstack(test_parts)
    test_set = DataSet(path="../FNC-1", name="competition_test")
    self.labels_test = test_set.get_labels()
def get_similarity(self, matrix=None, langue=None):
    """Compute the similarity matrix for ``matrix``.

    When ``langue`` is None, ``matrix`` is stored directly as the sparse
    matrix; otherwise it is first converted through ``TFIDF(matrix, langue)``.
    If ``self.dr`` equals False, the dimensionality reduction is set to three
    quarters (rounded) of the length of the first row of ``matrix``.

    Args:
        matrix: input matrix; its first row must support ``len`` when
            ``self.dr`` is False.
        langue: language passed to ``TFIDF``, or None to skip the conversion.

    Returns:
        The result of ``self._get_similarity_matrix``.
    """
    # Identity test: `== None` may be overridden by custom __eq__.
    if langue is None:
        self.set_sparse_matrix(matrix)
    else:
        tfidf = TFIDF(matrix, langue)
        self.set_sparse_matrix(tfidf.get_sparse_matrix())

    # Equality (not truthiness) is kept on purpose: a `dr` of None must not
    # trigger the default.
    if self.dr == False:  # noqa: E712
        dimensionality_reduction = int(round((len(matrix[0]) * 3 / 4)))
        self.set_dimensionality_reduction(dimensionality_reduction)

    permutation_matrix = self._get_permutation_matrix()
    signature_matrix = self._get_signature_matrix(permutation_matrix)
    return self._get_similarity_matrix(signature_matrix)
def get_dominating_words(context_dict, corpusdir):
    """Pick, for every context, the token with the highest idf value.

    Each context is scanned token by token until the marker ``"-ENT-"`` is
    met. The token with the largest ``tfidf.idf`` value seen so far (greater
    than -1) is recorded under the context's key. idf values are cached per
    token.

    Args:
        context_dict: mapping from key to an iterable of contexts, each
            context an iterable of tokens.
        corpusdir: passed to ``TFIDF``.

    Returns:
        The dict created by ``init_dict()``, with the chosen tokens appended
        to ``dominating[key]``.
    """
    tfidf = TFIDF(corpusdir)
    dominating = init_dict()
    cache = dict()
    for t in context_dict.keys():
        contexts = context_dict[t]
        for c in contexts:
            curr_max = (None, -1)
            for tok in c:
                if tok == "-ENT-":
                    break
                # dict.has_key() does not exist on Python 3; `in` works on both.
                if tok not in cache:
                    cache[tok] = tfidf.idf(tok)
                if cache[tok] > curr_max[1]:
                    curr_max = (tok, cache[tok])
            if curr_max[0] is not None:
                dominating[t].append(curr_max[0])
    return dominating
def main():
    """Open every file under ``lily`` plus the stop-word list and run TFIDF.

    For each directory entry an input file (utf-8) is opened from the
    directory and an output file of the same name is opened for writing in
    the current working directory. Both are stored in dicts keyed by a running
    integer and passed to ``TFIDF`` with the stop-word file and the file count.
    """
    #########Input and Output##########
    lilypath = 'lily'  #IMPORTANT! Set your own lily path and stopWords
    stopWordspath = 'Chinese-stop-words.txt'  #IMPORTANT!
    inputfile = {}
    outputfile = {}
    cnt = 0
    stopWords = codecs.open(stopWordspath, 'r', 'gbk')
    try:
        for filename in os.listdir(lilypath):
            inputfile[cnt] = codecs.open(os.path.join(lilypath, filename),
                                         'r', 'utf-8')
            outputfile[cnt] = open(filename, 'w+')
            cnt += 1
        #############TFIDF#############
        TFIDF(inputfile, outputfile, stopWords, cnt)  #The TFIDF algorithm
    finally:
        # Close whatever was opened, even if opening a later file or the
        # TFIDF run itself failed.
        for handle in list(inputfile.values()) + list(outputfile.values()):
            handle.close()
        stopWords.close()
def testEventFC(seedUrls, pLimit, eventTree):
    """Persist the seeds and event tree, build a TFIDF model and run ``eventFC``.

    Args:
        seedUrls: text written to ``addurls.txt``.
        pLimit: value stored in the options as ``num_pages``.
        eventTree: text written, lower-cased, to ``event-details.txt``.
    """
    # Write the seedUrls to a file. A context-managed text file replaces the
    # raw os.open/os.write pair, which leaked the descriptor on a failed write
    # and rejects str on Python 3.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as f:
        f.write(seedUrls)

    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    with open(eventFile, 'w') as fw:
        fw.write(eventTree.lower())

    mytfidf = TFIDF()  # appears to work fine (called then exited)
    myEventScorer = EventScorer.EventScorer()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(seedURLs)
    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def tfidf(mi):
    """Classify a sample of messages with the TFIDF kNN classifier and print results.

    ``MessageFeatures.test_fold`` is set to -1 and a ``TFIDF(mi, 3)`` model is
    trained with ``train1()``. While walking ``mi``, messages whose
    ``newsgroupnum`` equals the current group counter are classified and the
    predicted class is printed. Once 20 such messages have been counted, the
    next message resets the count and advances the group counter.
    ``tf.correct`` is printed at the end.

    Args:
        mi: iterable of messages exposing ``newsgroupnum``.
    """
    MessageFeatures.test_fold = -1
    tf = TFIDF(mi, 3)
    tf.train1()
    # The original initialised these three values twice; once is enough.
    cj = 0
    cj_count = 0
    tf.correct = 0
    for m in mi:
        if cj_count >= 20:
            cj_count = 0
            cj += 1
        elif m.newsgroupnum == cj:
            cj_count += 1
            c = tf.get_class_kNN(m)
            print(c)
    print(tf.correct)
def testEventFC(seedFile, pLimit):
    """Build a TFIDF model from the seed documents and run ``eventFC``.

    Prints a ``Content-Type`` header and the seed URL list, sets the event
    scorer threshold to the page score threshold, and passes the scorer,
    model and crawl options to ``eventFC``.

    Args:
        seedFile: passed to ``downloadRawDocs`` and ``getSeedURLs``.
        pLimit: value stored in the options as ``num_pages``.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Content-Type: text/plain\n\n')
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def testBaseFC(seedUrls, pLimit):
    """Persist the seeds, build a TFIDF model from them and run ``baseFC``.

    Args:
        seedUrls: text written to ``addurls.txt``.
        pLimit: value stored in the options as ``num_pages``.
    """
    mytfidf = TFIDF()
    # Not used below; construction kept in case it has side effects.
    myEventScorer = EventScorer.EventScorer()

    # Write the seedUrls to a file. A context-managed text file replaces the
    # raw os.open/os.write pair, which leaked the descriptor on a failed write
    # and rejects str on Python 3.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, 'w') as f:
        f.write(seedUrls)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(seedURLs)
    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    pos = cleandocs
    mytfidf.buildModel(pos)
    baseFC(mytfidf, options)
def part1(documentPath, maximumDocuments=0):
    """Parse the XML corpus, clean it, and compute TFIDF plus cosine similarity.

    Progress and per-stage timings are printed along the way. Documents
    without a ``BODY`` field are removed from the parsed list; the others get
    their ``BODY`` replaced by ``removeStopwords`` of it.

    Args:
        documentPath: passed to ``XMLParse``.
        maximumDocuments: passed to ``XMLParse``.

    Returns:
        The ``TFIDF`` object after ``calculateCosineSimilarity()`` has run.
    """
    startTime = time.time()
    stageTimes = []

    def lap():
        # Duration of the stage that just ended: total elapsed time minus
        # every (already rounded) stage duration recorded so far.
        remaining = time.time() - startTime
        for spent in stageTimes:
            remaining = remaining - spent
        stageTimes.append(round(remaining, 3))
        return stageTimes[-1]

    def report(duration):
        print("Time: " + str(duration) + " seconds")

    print("Executing code for Part 1...\n")

    print("Extracting data from XML Document...")
    values = XMLParse(documentPath, maximumDocuments)
    print("Number of Documents: "+str(len(values)))
    report(lap())

    print("Removing stopwords and stemming...")
    # Walk backwards so deleting an entry never shifts an unvisited index.
    for idx in reversed(range(len(values))):
        if not values[idx].hasField('BODY'):
            del values[idx]
            continue
        cleaned = removeStopwords(values[idx].getField("BODY"))
        values[idx].setField('BODY', cleaned)
    report(lap())

    print("Creating list of all unique words in corpus...")
    uniqueWords = getUniqueWords(values)
    report(lap())

    print("Computing TF, IDF, and TFIDF...")
    computedTFIDF = TFIDF(values, uniqueWords)
    report(lap())

    print("Computing Cosine Similarity...")
    computedTFIDF.calculateCosineSimilarity()
    report(lap())

    print('\nPart 1 Complete')
    print("Execution Time: " + str(round(time.time() - startTime, 3)) + " seconds\n")
    return computedTFIDF
def testEventFC(seedUrls, pLimit, eventTree):
    """Persist the seeds and event tree, build a TFIDF model and run ``eventFC``.

    Args:
        seedUrls: text written to ``addurls.txt``.
        pLimit: value stored in the options as ``num_pages``.
        eventTree: text written to ``event-details.txt``.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Content-Type: text/plain\n\n')
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()

    # Write the seedUrls to a file. The former mode "rw" is not a valid mode
    # (ValueError on Python 3) and could not create the file that was just
    # removed; "w" creates it.
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    with open(seedFile, "w") as f:
        f.write(seedUrls)

    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    with open(eventFile, "w") as fw:
        fw.write(eventTree)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print(seedURLs)
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def test():
    """Run ``baseFC`` on the typhoon Haiyan seed list with fixed settings.

    The seed file is read for both the raw documents and the seed URLs. The
    tokenized documents are used to build the TFIDF model handed to
    ``baseFC`` together with the crawl options.
    """
    seed_path = "typhoon_haiyan_SEED_URLs.txt"

    mytfidf = TFIDF()
    docs = downloadRawDocs(seed_path)
    options = {
        "num_pages": 1000,
        "pageScoreThreshold": 0.5,
        "urlScoreThreshold": 0.4,
        "seeds": getSeedURLs(seed_path),
    }

    # The tokenized seed documents are the model's input.
    mytfidf.buildModel(getTokenizedDocs(docs))
    baseFC(mytfidf, options)
def get_Dnn_model(total_pl):
    """Build the DNN graph, train it via ``run`` and print test-set metrics.

    The data from ``pl_preprocessing`` is split 80/20 into train and test
    parts. The train part is split again by ``CrossValidationFolds`` into
    train and validation sets. After ``run`` returns, the checkpoint
    ``../dnn_model.ckpt`` is restored and accuracy, precision and recall on
    the test part are printed.

    Args:
        total_pl: passed to ``TFIDF`` and ``pl_preprocessing``.
    """
    NUM_PL = 8
    D_WORD = 300
    # Preliminary setting. Must be bound before its first use below: the
    # original assigned it after the CrossValidationFolds call, which made
    # that call raise UnboundLocalError.
    FOLDS = 5

    tf_idf = TFIDF(total_pl)
    pl_cnt, words = tf_idf.get_tfidf()
    # NOTE(review): called with two positional arguments - confirm this
    # matches the signature of pl_preprocessing in this project.
    x, y = pl_preprocessing(total_pl, NUM_PL)
    x = np.array(x)
    y = np.array(y)

    ###### test set drawn from the same data as the training set #######
    X_train, X_test1, Y_train, y_test1 = train_test_split(x, y, test_size=0.2)
    data = CrossValidationFolds(X_train, Y_train, FOLDS)
    (X_train1, y_train1), (X_valid1, y_valid1) = data.split()

    ###### test set different from the training data #######
    # data = CrossValidationFolds(x, y, FOLDS)
    # (X_train1, y_train1), (X_valid1, y_valid1) = data.split()
    # X_test1,y_test1 = load_pl('../new_Steeve_data/filter_Dice/can/')
    # X_test1 = np.array(X_test1)
    # y_test1 = np.array(y_test1)

    ##### testing data ######
    # Tx = X_test1[0]
    # Ty = y_test1[0]
    # Tx = Tx.reshape([1,-1])
    # print(Tx.shape)
    # print(X_test1.shape)

    in_units = D_WORD * NUM_PL
    # Original note: the task only requires recognising classes 0-4, five in
    # total. NOTE(review): the value is 6 - confirm.
    n_class = 6
    n_train = len(X_train1)  # length of the training data
    batch_size = 50
    n_batch = n_train // batch_size

    # Input placeholder of shape [None, in_units]
    X = tf.placeholder(tf.float32, [None, in_units], name="X")
    # Label placeholder of shape [None]
    y = tf.placeholder(tf.int64, shape=(None), name="y")

    logits = L_layers_model(X, 128, n_class, 0.5)
    Y_proba = tf.nn.softmax(logits, name="Y_proba")
    # Bound to a new local name: rebinding `train_op` here made the name
    # local to the function, so calling it raised UnboundLocalError.
    loss, train_step = train_op(y, logits)
    accuracy, precision, recall = acc_model(y, logits)
    prediction = tf.argmax(Y_proba, 1)
    saver = tf.train.Saver()  # call save function
    config = tf.ConfigProto(device_count={'GPU': 1})  # select the GPU

    # Params for Train
    epochs = 1000  # 10 for augmented training data, 20 for training data
    # Original note: evaluate validation accuracy every 50 steps.
    # NOTE(review): the value is 100 - confirm.
    val_step = 100
    # Training cycle
    max_acc = 0.  # Save the maximum accuracy value for validation data
    early_stop_limit = 0  # early-stop counter
    init = tf.global_variables_initializer()
    init_l = tf.local_variables_initializer()

    with tf.Session(config=config) as sess:
        run(sess, X_train1, y_train1, X_valid1, y_valid1)
        sess.run(init_l)
        # Load the model saved by early stopping.
        saver.restore(sess, '../dnn_model.ckpt')
        print('Acc_test :',
              sess.run(accuracy, feed_dict={
                  X: X_test1,
                  y: y_test1
              }))
        print('Prec_value :',
              sess.run(precision, feed_dict={
                  X: X_test1,
                  y: y_test1
              }))
        print('Recall_value :',
              sess.run(recall, feed_dict={
                  X: X_test1,
                  y: y_test1
              }))
def cnn_output(input_file_name, output_file_name, output_num, top_k):
    """Generate cnn outputs for randomly drawn questions.

    ``output_num`` questions are drawn at random (seed 12345). For each, the
    TFIDF model supplies ``top_k * 10`` candidate ids and the CNN model
    returns the responses. Depending on the extension of
    ``output_file_name`` the result is written as plain text (``txt``) or as
    CSV (``csv``) with columns ``Question``, ``Reply 1..top_k`` and ``Score``.
    At most ``top_k`` responses are written per question.

    Args:
        input_file_name: passed to ``Data.read_pred_data``.
        output_file_name: destination path; its extension selects the format.
        output_num: number of questions to sample.
        top_k: maximum number of responses per question.
    """
    # time.clock() was removed in Python 3.8; use perf_counter when it exists
    # and fall back to clock on interpreters that predate it.
    timer = getattr(time, "perf_counter", None) or time.clock

    random.seed(12345)
    retrieval_data_start_time = timer()
    questions, pred_questions, answers, pred_answers = Data.read_pred_data(
        input_file_name)
    # Build word --> sentence dictionary
    word_sentence_dict = Data.generate_word_sentence_dict(pred_questions)
    print("Retrieval Data Finished")
    retrieval_data_end_time = timer()
    print("Retrieval Data cost %f" %
          (retrieval_data_end_time - retrieval_data_start_time))

    cnn_response_start_time = timer()
    tfidf = TFIDF(questions, pred_questions, answers, pred_answers,
                  word_sentence_dict)
    cnn = CNN(questions, pred_questions, answers, pred_answers,
              word_sentence_dict, isTrain=False)

    extension = output_file_name.split(".")[-1]

    if extension == "txt":
        # `with` closes the file even when a model call raises.
        with open(output_file_name, "w") as output:
            for _ in range(output_num):
                qs_index = int(random.random() * len(questions))
                qs_input = questions[qs_index].encode("utf-8")
                output.write("Question : %s\n" % qs_input)
                tfidf_response_id, tfidf_response = tfidf.ask_response(
                    qs_input, top_k * 10)
                cnn_response = cnn.ask_response(qs_input, top_k,
                                                tfidf_response_id)
                # Bounded like the csv branch so a short result list cannot
                # raise IndexError.
                for rank in range(min(top_k, len(cnn_response))):
                    output.write("CNN response %d: %s\n" %
                                 (rank + 1, cnn_response[rank].encode("utf-8")))
                output.write("\n")
        cnn_response_end_time = timer()
        print("CNN response cost %f" %
              (cnn_response_end_time - cnn_response_start_time))

    if extension == "csv":
        with open(output_file_name, 'w') as csvfile:
            fieldnames = ['Question']
            fieldnames.extend(["Reply " + str(i + 1) for i in range(top_k)])
            fieldnames.append("Score")
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for _ in range(output_num):
                # Named `row` so the builtin `dict` is not shadowed.
                row = {"Score": ""}
                qs_index = int(random.random() * len(questions))
                qs_input = questions[qs_index].encode("utf-8")
                row["Question"] = qs_input
                tfidf_response_id, tfidf_response = tfidf.ask_response(
                    qs_input, top_k * 10)
                cnn_response = cnn.ask_response(qs_input, top_k,
                                                tfidf_response_id)
                for rank in range(min(top_k, len(cnn_response))):
                    row["Reply " + str(rank + 1)] = \
                        cnn_response[rank].encode("utf-8")
                writer.writerow(row)
def main():
    """Command-line entry point: ``<command> <no>`` read from ``sys.argv``.

    ``CP`` with ``no`` 1 or 2 builds a ``MovieTensor`` and calls
    ``getTensor()``. ``SVD`` computes a weighted TF and an IDF per
    (actor, tag) from the CSV files, prints the resulting actor x tag TF-IDF
    pivot table and then calls ``TFIDF("", 1, "_actor_").calcMoviesVector()``.
    """
    command = sys.argv[1]
    no = int(sys.argv[2])
    if command == "CP":
        if no == 1:
            tensor = MovieTensor(1)
            tensor.getTensor()
        elif no == 2:
            tensor = MovieTensor(2)
            tensor.getTensor()
    elif command == "SVD":
        allactormoviesdata = pandas.read_csv("movie-actor.csv")
        alltagsdata = pandas.read_csv("mltags.csv")
        # Per-movie max / min actor rank, the inputs to the rank weight.
        allactormoviesdata['max_actor_rank'] = allactormoviesdata.groupby(
            ['movieid'])['actor_movie_rank'].transform(max)
        allactormoviesdata['min_actor_rank'] = allactormoviesdata.groupby(
            ['movieid'])['actor_movie_rank'].transform(min)
        allactormoviesdata['actor_rank_weightage'] = allactormoviesdata.apply(
            compute_actor_weightage, axis=1)
        # # print(allactormoviesdata)
        # Timestamp weight per tag row, given the overall min / max timestamp.
        min_timestamp = pandas.to_datetime(min(alltagsdata['timestamp']))
        max_timestamp = pandas.to_datetime(max(alltagsdata['timestamp']))
        alltagsdata['timestamp_weightage'] = alltagsdata.apply(
            CalculateTimestampWeights,
            axis=1,
            args=(min_timestamp, max_timestamp))
        # Join actors and tags on the movie they share.
        mergeddata = allactormoviesdata[[
            'actorid', 'movieid', 'actor_rank_weightage'
        ]].merge(alltagsdata[['movieid', 'tagid', 'timestamp_weightage']],
                 on='movieid')
        #print(mergeddata[mergeddata['actorid'].isin([878356,1860883,316365,128645])])
        mergeddata['total_weightage'] = mergeddata.apply(
            aggregate_tf_weightages, axis=1)
        # Sum of the weights per (actor, tag) pair.
        mergeddata['tag_weightage'] = mergeddata.groupby(
            ['actorid', 'tagid'])['total_weightage'].transform('sum')
        tfdata = mergeddata[['actorid', 'tagid', 'tag_weightage'
                             ]].drop_duplicates(subset=['tagid', 'actorid'])
        tfdata['total_weightage_actor'] = tfdata.groupby(
            ['actorid'])['tag_weightage'].transform('sum')
        tfdata['tf'] = tfdata.apply(ComputeTF, axis=1)
        taglist = tfdata['tagid'].tolist()
        # Both CSVs are read again so the IDF part starts from data without
        # the columns added above.
        alltagsdata = pandas.read_csv("mltags.csv")
        alltagsdata = alltagsdata[alltagsdata['tagid'].isin(taglist)]
        #print(alltagsdata)
        allactormoviesdata = pandas.read_csv("movie-actor.csv")
        requiredtagsdata = alltagsdata.merge(allactormoviesdata, on='movieid')
        # One row per (tag, actor), then count the actors per tag.
        requiredtagsdata.drop_duplicates(subset=['tagid', 'actorid'],
                                         inplace=True)
        requiredtagsdata['actor_count'] = requiredtagsdata.groupby(
            'tagid')['actorid'].transform('count')
        requiredtagsdata.drop_duplicates(subset=['tagid'], inplace=True)
        actordata = pandas.read_csv("imdb-actor-info.csv")
        total_actors = actordata.shape[0]
        requiredtagsdata['idf'] = requiredtagsdata.apply(
            ComputeIDF, axis=1, total_actors=total_actors)
        # # print(total_actors)
        # print(requiredtagsdata)
        tfidfdata = ProcessTFandIDFtoTFIDF(tfdata,
                                           requiredtagsdata[['tagid', 'idf']])
        # print(tfdata)
        #tfidfdata = tfidfdata[tfidfdata['actorid'].isin([878356,1860883,316365,128645])]
        #print(tfidfdata)
        # Actors as rows, tags as columns; missing pairs become 0.
        actor_tag_matrix = tfidfdata.pivot_table(index='actorid',
                                                 columns='tagid',
                                                 values='tfidf',
                                                 fill_value=0)
        print "Actor Tag Matrix"
        print actor_tag_matrix
        tf = TFIDF("", 1, "_actor_")
        tf.calcMoviesVector()
# Collect the article texts, titles and event labels into plain lists.
corpus = []
for text in new_df['content']:
    corpus.append(text)

titles = []
for title in new_df["title"]:
    titles.append(str(title))

#labels_df starts at df[5000] so we're good on the matching of labels to content
# Only the first 1000 event labels are used.
events = []
for event in labels_df["Event"][:1000]:
    events.append(str(event))

from TFIDF import TFIDF
#creates TFIDF matrix (the return value is not kept here)
TFIDF(corpus)

##############################################################################
###################KMEANS#####################################################
##############################################################################
from sklearn.externals import joblib
#Loads my pre-existing kmeans model
#To save a model you just made instead:
#joblib.dump(km, '700_No_Ngram.pkl')
km = joblib.load("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/KMeans_Cluster_Models/350_no_Ngram.pkl")
# Cluster id of every sample the loaded model was fitted on.
clusters = km.labels_.tolist()

#Only to create a new kmeans model
from sklearn.cluster import KMeans
# Load the FAQ table and the test questions.
faq = pd.read_csv('../data/interim/faq-text-separated.csv',
                  keep_default_na=False)
test_questions = pd.read_csv('../data/test/test-questions.csv')
# Columns passed to the models as features.
features = ['Topic', 'Category', 'Department', 'question', 'answer']

# Request data: keep the description and its topic, renamed to the column
# names used for evaluation.
test_topics = pd.read_excel(
    '../../../Inquire Boulder request data- detailed open and closed - for research purposes.xlsx'
)
test_topics = test_topics[['Description', 'Topic']]
test_topics = test_topics.rename(index=str,
                                 columns={
                                     "Description": "test_question",
                                     "Topic": "match_topic"
                                 })

# # Evaluate KDTree on questions
# kdtree = KDTREE(faq, features, 'KDTREE')
# kdtree.evaluate(test_questions, 'questions')

# # Evaluate Word2Vec on questions
# w2v = W2V(faq, features, 'W2V')
# w2v.evaluate(test_questions, 'questions')
# w2v.evaluate(test_topics, 'topics')

# Evaluate TFIDF on questions and Topics (topic evaluation currently disabled)
tfidf = TFIDF(faq, features, 'TFIDF')
tfidf.evaluate(test_questions, 'questions')
# tfidf.evaluate(test_topics, 'topics')
from ExtractAbstract import ExtractAbstract
from InformationContent import InformationContent
from TFIDF import TFIDF as TFIDF
from ClusterRelatedness import ClusterRelatedness
from DimensionRelatedness import DimensionRelatedness
from RelatednessGraph import RelatednessGraph

# Script entry point: load the IC and TFIDF tables, then extract words from
# the abstracts using both.
if __name__ == "__main__":
    ''' PART 1 - 1 Calculate IC and TFIDF '''
    IC = InformationContent("./source/ic.txt")
    #DEBUG: IC.printSortedList()
    TfIdf = TFIDF("./source/tfidf.txt")
    #DEBUG: TfIdf.printSortedList()

    ''' PART 1 - 2 Use IC and TFIDF to extract words from abstracts '''
    Extractor = ExtractAbstract("./source/corpus5.csv", IC, TfIdf, 0.35, 0.3)  # IC threshold / TFIDF threshold

    ''' PART 2 Calculate Relatedness '''
    # Finding Relatedness 1 - Find Vector Cluster
    # ClusterRelatedness = ClusterRelatedness("./source/vectors.txt", "./abstracts/", Extractor.fileNum)
    # Finding Relatedness 2 - Compare Word Pairs
def __init__(self, model):
    """Keep the model and create the database connection and TFIDF helper.

    Args:
        model: stored as ``self.model``.
    """
    self.model = model
    connection = DBConnect()
    self.db = connection
    actor_tfidf = TFIDF("", "", "_actor_")
    self.tfIdf = actor_tfidf
def get_tfidf():
    """Return the first element of ``TFIDF(get_raw_pl()).get_tfidf()``.

    ``get_tfidf()`` on the model must return exactly two values; the second
    one is discarded.
    """
    model = TFIDF(get_raw_pl())
    scores, _unused_words = model.get_tfidf()
    return scores
# Collect the article texts, titles and event labels into plain lists.
corpus = []
for text in new_df['content']:
    corpus.append(text)

titles = []
for title in new_df["title"]:
    titles.append(str(title))

#labels_df starts at df[5000] so we're good on the matching of labels to content
# Only the first 1000 event labels are used.
events = []
for event in labels_df["Event"][:1000]:
    events.append(str(event))

import os
# The local TFIDF module is imported relative to this directory.
os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Pre-Processing")
from TFIDF import TFIDF
tfidf_matrix = TFIDF(corpus)

####################################################################
##########################HAC#######################################
####################################################################
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

hac = AgglomerativeClustering(n_clusters=500, affinity = "euclidean")
# The matrix is densified before fitting.
dense_matrix = tfidf_matrix.todense()
hac.fit_predict(dense_matrix)

from sklearn.externals import joblib
#Saves the model you just made
# NOTE(review): the file name says 350 while n_clusters is 500 - confirm.
joblib.dump(hac, '350_euc_HAC.pkl')