def predict():
    preproses()
    td = TFIDF([xdata, ydata])
    clasification = []
    # Receives the input query from form
    if request.method == 'POST':
        namequery = request.form['namequery']
        spliter = namequery.split(',')
        for row in spliter:
            clasification.append(testFromTrained([td.transform(row)]))
        print(clasification)
        keras.clear_session()
        labels, values = np.unique(clasification, return_counts=True)
        pie_labels = labels
        pie_values = values
        colors = ["#F7464A", "#46BFBD"]
        return render_template('hasil.html',
                               set=zip(values, labels, colors),
                               clasification=zip(spliter, clasification),
                               legenda=zip(labels, values))
def wordcount(filename, ent_file, tfidf, text, id):
    resources = open(filename)
    resources.readline()  # skip header
    wordcount = TFIDF(get_entities(ent_file))
    for id, lines in groupby(csv.reader(resources), id):
        maintext = ' '.join(text(line).lower() for line in lines)
        wordcount.process(maintext)
    wordcount.done()
    out = open(tfidf, 'w')
    for word, _, _, tfidf in wordcount.highest(200):
        out.write('%s\t%f\n' % (word, tfidf))
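# Hypothetical invocation of wordcount() (file names are illustrative; the
# lambdas mirror the defaults of count() later in this collection):
# wordcount('../data/essays.csv', ent_file, 'wc_essays.tsv',
#           text=lambda line: ' '.join(line[3:10]),
#           id=lambda line: line[0])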
def preprocess(self, filepath):
    dataset = pd.read_csv(filepath, delimiter=',')
    self.xData = []
    self.yData = []
    for k in dataset['Kalimat']:
        self.xData.append(k)
    for k in dataset['Formalitas']:
        self.yData.append(k)
    self.tfidf_data = TFIDF([self.xData, self.yData])
def test_tfidf(self):
    """
    Test the TF-IDF scheme.
    """
    idf = {'a': 2, 'b': 1, 'c': 1}
    tokens = ['a', 'b', 'b', 'c', 'd']
    tfidf = TFIDF(idf, 3)
    document = tfidf.create(tokens)
    self.assertEqual(0, document.dimensions['a'])
    self.assertEqual(0.35218, round(document.dimensions['b'], 5))
    self.assertEqual(0.17609, round(document.dimensions['c'], 5))
    self.assertEqual(0.47712, round(document.dimensions['d'], 5))
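# The expected values above are consistent with a raw term count multiplied by
# a smoothed IDF of log10(N / (df + 1)) with N = 3 documents; this is a sketch
# of the arithmetic, not necessarily the library's exact formula:
import math
assert round(2 * math.log10(3 / (1 + 1)), 5) == 0.35218  # 'b': count 2, df 1
assert round(1 * math.log10(3 / (1 + 1)), 5) == 0.17609  # 'c': count 1, df 1
assert round(1 * math.log10(3 / (0 + 1)), 5) == 0.47712  # 'd': unseen, df 0
assert 1 * math.log10(3 / (2 + 1)) == 0                  # 'a': df 2, idf 0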
def parseQuery(self, query, invIndex):
    # Both handlers return the respective TF-IDFs.
    # docTF_IDF can be run once after crawl.
    tfidf = TFIDF()
    # print(invIndex)
    docTF_IDF = tfidf.docHandler(invIndex, 0)
    # print(docTF_IDF)
    queryTF_IDF = self.queryHandler(query, invIndex)
    if queryTF_IDF == -1:
        print("No words from your search were found in any documents... Please try new search terms!")
        return -1
    cosSimByDoc = self.cosSimilarityHandler(docTF_IDF, queryTF_IDF)
    # print("Cosine Similarity by document:", cosSimByDoc)
    return cosSimByDoc
def __train_models(self):
    # Load all sentences from the specific domain, then train the TFIDF
    # model and the NGramPerplexity model.
    self.ngp = NGramPerplexity()
    self.tfidf = TFIDF()
    print("Training models from specific corpora")
    for file in os.listdir(self.input_dir):
        print("Training models from specific corpora: " + file)
        with open(self.input_dir + "/" + file, encoding="utf-8") as input:
            for line in input:
                words = WordExtractor.get_words(line)
                if len(words) == 0:
                    continue
                self.sentences.append(words)
                self.ngp.train_from_text(words)
                self.tfidf.train_from_text(words)
def test_export(self):
    """
    Test exporting and importing the IDF table.
    """
    idf = {'a': 2, 'b': 1, 'c': 1}
    tfidf = TFIDF(idf, 3)
    e = tfidf.to_array()
    self.assertEqual(tfidf.global_scheme.documents, TFIDF.from_array(e).global_scheme.documents)
    self.assertEqual(tfidf.global_scheme.idf, TFIDF.from_array(e).global_scheme.idf)
    self.assertEqual(tfidf.local_scheme.__dict__, TFIDF.from_array(e).local_scheme.__dict__)
    self.assertEqual(tfidf.global_scheme.__dict__, TFIDF.from_array(e).global_scheme.__dict__)
def preproses(self, filepath):
    f = open(filepath)
    # split on newlines
    sents = f.read().split('\n')
    # shuffle the sentence order
    shuffle(sents)
    # for each sentence:
    # - split on the semicolon separator
    # - append text and label to the corresponding lists
    for sent in sents:
        temp = sent.split(';')
        if len(temp) == 2:
            self.xdata.append(temp[0])
            self.ydata.append([int(temp[1])])
    # prepare the TF-IDF features
    self.tfidf_data = TFIDF([self.xdata, self.ydata])
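# Expected input format for preproses(): one "sentence;label" pair per line.
# Illustrative content (the sentences appear in the test code later in this
# collection; the labels here are assumptions):
#   ahok itu pemimpin yang beres memimpin;1
#   ahok itu pemimpin yang ga beres memimpin;0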
def count(district, type='essays',
          extract_text=lambda line: ' '.join(line[3:10]),
          id=lambda line: line[0]):
    # Bind each projects.csv column name to its index (columns 0..45).
    (_projectid, _teacher_acctid, _schoolid, school_ncesid, school_latitude,
     school_longitude, school_city, school_state, school_zip, school_metro,
     school_district, school_county, school_charter, school_magnet,
     school_year_round, school_nlns, school_kipp, school_charter_ready_promise,
     teacher_prefix, teacher_teach_for_america, teacher_ny_teaching_fellow,
     primary_focus_subject, primary_focus_area, secondary_focus_subject,
     secondary_focus_area, resource_usage, resource_type, poverty_level,
     grade_level, vendor_shipping_charges, sales_tax,
     payment_processing_charges, fulfillment_labor_materials,
     total_price_excluding_optional_support,
     total_price_including_optional_support, students_reached,
     used_by_future_students, total_donations, num_donors,
     eligible_double_your_impact_match, eligible_almost_home_match,
     funding_status, date_posted, date_completed,
     date_thank_you_packet_mailed, date_expiration) = range(46)

    # Collect the ids of projects posted in 2011.
    proj_ids = []
    projects = open('../data/projects.%scsv' % district)
    projects.readline()  # skip header
    for proj in csv.reader(projects):
        if proj[date_posted].startswith('2011'):
            proj_ids.append(proj[0])
    proj_ids = frozenset(proj_ids)
    projects.close()

    wordcount = TFIDF(get_entities(ent_file))
    essays = open('../data/%s.%scsv' % (type, district))
    essays.readline()  # skip header
    for proid, lines in groupby(csv.reader(essays), id):
        if proid in proj_ids:
            text = ' '.join(extract_text(line) for line in lines).lower()
            wordcount.process(text)
    wordcount.done()
    essays.close()

    out = open('../data/wc_%s%scsv' % (type, district), 'w')
    for word, tf, df, tfidf in wordcount.highest(0):
        out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))
def getRecommendation(new_df, record):
    temp_df = new_df[['id', 'name', 'album', 'artist', 'release_date']]
    temp_df = pd.concat([temp_df, record], ignore_index=True)
    col = ['name', 'album', 'artist', 'release_date']
    data = pd.DataFrame(columns=col)
    id = []
    for i in col:
        yield "<br/>"  # stream a progress marker per column
        tf = TFIDF(temp_df, i)
        cosine_sim = linear_kernel(tf, tf)
        data[i] = cosine_sim[-1]  # similarity of the appended record to every row
        d1 = data.sort_values(by=[i], ascending=False)
        id.append(list(d1.head(7).index))
    tid = []
    for i in range(4):  # one id list per column in `col`
        track_id = []
        for j in id[i]:
            track_id.append(temp_df.iloc[j, 0])
        tid.append(track_id)
    return tid  # surfaces as StopIteration.value, since this is a generator
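# Because getRecommendation both yields and returns, it is a generator: the
# "<br/>" markers come out of iteration, and the final `tid` only surfaces as
# StopIteration.value. A minimal retrieval sketch (caller code assumed, not
# from the source):
gen = getRecommendation(new_df, record)
while True:
    try:
        next(gen)  # consume the streamed "<br/>" markers
    except StopIteration as stop:
        tid = stop.value
        break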
def upload_file():
    if request.method == 'POST':
        if 'file' not in request.files:
            flash('No file part')
            # return redirect(request.url)
        file = request.files['file']
        if file.filename == '':
            flash('No selected file')  # was flask(...): flask is the module, not a callable
            # return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            # return redirect(url_for('upload_file', filename=filename))
            print(filename)
            fold = "data/" + filename
            print(fold)
            with open(fold, 'r') as csv_par:
                preproses()
                td = TFIDF([xdata, ydata])
                clasification = []
                csv_reader = csv_par.read().split('\n')
                for row in csv_reader:
                    clasification.append(testFromTrained([td.transform(row)]))
                keras.clear_session()
                labels, values = np.unique(clasification, return_counts=True)
                pie_labels = labels
                pie_values = values
                colors = ["#F7464A", "#46BFBD"]
                return render_template('hasil.html',
                                       set=zip(values, labels, colors),
                                       clasification=zip(csv_reader, clasification),
                                       legenda=zip(labels, values))
def parsing():
    with open('data/test.csv', 'r') as csv_par:
        preproses()
        td = TFIDF([xdata, ydata])
        rowdata = []
        clasification = []
        csv_reader = csv_par.read().split('\n')
        for row in csv_reader:
            rowdata.append(row)
            clasification.append(testFromTrained([td.transform(row)]))
        keras.clear_session()
        labels, values = np.unique(clasification, return_counts=True)
        pie_labels = labels
        pie_values = values
        colors = ["#F7464A", "#46BFBD"]
        return render_template('hasil.html',
                               set=zip(values, labels, colors),
                               clasification=zip(csv_reader, clasification),
                               legenda=zip(labels, values))
def start(tfidf_threshold):
    # initialize TFIDF
    phrase_file = open("text_segmented_by_phrase.txt", "r")
    for line in phrase_file:
        index, text = line.split("##")
        token_list = text.lower().strip().split("!!")
        id_phrases[index] = token_list
    phrase_file.close()
    tfidf = TFIDF(id_phrases.values())
    print("TFIDF initialized")

    input_file = open("publications.txt")
    # input_file = open("pub_min.txt")
    while True:
        ''' Parse paper title. Test for EOF. '''
        line = input_file.readline().strip()
        if len(line) == 0:
            break
        assert line[:2] == "#*"
        title = line[2:]

        ''' Parse author. '''
        line = input_file.readline().strip()
        assert line[:2] == "#@"
        authors = line[2:].split(',')

        ''' Parse year. '''
        input_file.readline()

        ''' Parse venue. '''
        line = input_file.readline().strip()
        assert line[:2] == "#c"
        venue = line[2:]

        ''' Parse paper id. Do not cast to integer. Simply unnecessary. '''
        line = input_file.readline().strip()
        assert line[:6] == "#index"
        id = line[6:]

        id_title[id] = title
        for a in authors:
            dictionary_add_set(author_papers, a, id)
            dictionary_add_set(author_venues, a, venue)
        dictionary_add_set(venue_papers, venue, id)
        paper_venue[id] = venue
        paper_authors[id] = authors

        ''' Parse citations. '''
        line = input_file.readline().strip()
        while line[:2] == "#%":
            ''' Invalid/empty citation. '''
            if len(line) <= 2:
                break
            dictionary_add_set(paper_papers, id, line[2:])
            line = input_file.readline().strip()

        ''' Read the empty string line so the readline output is not confused
        with EOF. Sets the reading pointer to the next paper's title line. '''
        line = input_file.readline()
        if line[:2] == "#!":
            input_file.readline()

    ''' Get terms for each paper. '''
    for paper_id, tok_list in id_phrases.items():
        ''' Assuming (id, list_of_tokens). If I'm wrong, the code will HCF. '''
        toks = [x for x in tok_list
                if len(x) > 2 and tfidf.tf_idf(x) > tfidf_threshold]
        toks = sorted(toks, key=lambda x: tfidf.tf_idf(x), reverse=False)
        paper_terms[paper_id] = toks[:min(3, len(toks))]
        for term in paper_terms[paper_id]:
            if term not in term_papers:  # dict.has_key() was removed in Python 3
                term_papers[term] = []
            term_papers[term].append(paper_id)

    return (paper_authors, paper_papers, paper_venue,
            author_papers, venue_papers, author_venues)
def reward2(s1, s2):
    indices = corpus[:]
    tfi = TFIDF()
    tfidf = tfi.get_tfidf(corpus)
    score = tfi.relevancy(tfidf, indices, s1, s2)
    return score + 1
def get_pred_api_set(desc):
    tfidf = TFIDF(desc).gen_vector()
    cluster = Match(tfidf).match()
    topN = TopN(cluster).get()
    return set(topN)
    '''
    static.upload_folder(Sm_Cover_Dir, overwrite=True)
    static.upload_folder(Bg_Cover_Dir, overwrite=True)
    logger.info("update static server success !")
    with NewsDB() as db:
        db.update_table_newsContent(method="rebuild", fromCache=False)
        db.update_table_newsDetail(method="update")
    '''
    with NewsDB() as db:  # do not update the static server, only the DB
        db.update_table_newsInfo(method="rebuild", fromCache=False)
        db.update_table_newsContent(method="rebuild", fromCache=False)
        db.update_table_newsDetail(method="update")
    WhooshIdx().create_idx()
    logger.info("update TFIDF ...")
    tfidf = TFIDF().init_for_update()
    tfidf.update()
    logger.info("update TFIDF success !")
else:  # for routine daily updates
    with NewsDB() as db:
        db.update_table_newsInfo(fromCache=False)
        newsIDs = db.get_newsIDs()
    # Update the static server immediately after the IDs change, so that users
    # requesting images during the update window do not cache 404 pages.
    logger.info("update static server ...")
    static = StaticManager(newsIDs)
    static.download_covers()
    static.to_jpeg()
    static.cv_compress_sm()  # write the output directly
    static.cv_compress_bg()
from tfidf import TFIDF
from match import Match
from topN import TopN
import sys

desc = sys.argv[1]

# online phase step 1
tfidf = TFIDF(desc).gen_vector()
# online phase step 2
cluster = Match(tfidf).match()
# online phase step 3
topN = TopN(cluster).get()

for i in topN:
    print(i)
import pickle
import nltk

from TrainingWithTFIDF import TFIDFTrainer
from FeatureExtractionWithTFIDF import TFIDFPreparer
from tfidf import TFIDF

tfidfTrainer = TFIDFTrainer()
tfidfInstance = TFIDF()
tfidfPreparer = TFIDFPreparer()


class IntentDetector:

    def prepareForNLP(self, text):
        sentences = nltk.sent_tokenize(text)
        sentences = [nltk.word_tokenize(sent) for sent in sentences]
        sentences = [nltk.pos_tag(sent) for sent in sentences]
        return sentences

    def getFilterChunk(self, sentence):
        chunkToExtract = """
        pattern: {<NNP|NNS|NN><WDT>?<VBP|VBZ>?<JJR>?<IN><CD><CC>?<CD>?}
        """
        parser = nltk.RegexpParser(chunkToExtract)
        result = parser.parse(sentence)
        chunks = []
        for subtree in result.subtrees():
            if subtree.label() == 'pattern':
                # body truncated in the original; collecting the matched
                # subtree is the natural completion
                chunks.append(subtree)
        return chunks
def parseQuery(query, invIndex):
    # Both handlers return the respective TF-IDFs.
    # docTF_IDF can be run once after crawl.
    tempTFIDF = TFIDF()
    queryObj = Query(query)
def start():
    # initialize TFIDF
    tfidf = TFIDF("tfidf_data/name_and_abstracts.txt")
    print("TFIDF initialized")

    # input_file = open("publications.txt")
    input_file = open("pub_min.txt")
    while True:
        ''' Parse paper title. Test for EOF. '''
        line = input_file.readline().strip()
        if len(line) == 0:
            break
        assert line[:2] == "#*"
        title = line[2:]
        toks = word_tokenize(title)
        toks = sorted(toks, key=lambda x: tfidf.tf_idf(x), reverse=True)
        print("sorted toks:" + str(toks))

        ''' Parse author. '''
        line = input_file.readline().strip()
        assert line[:2] == "#@"
        authors = line[2:].split(',')

        ''' Parse year. '''
        input_file.readline()

        ''' Parse venue. '''
        line = input_file.readline().strip()
        assert line[:2] == "#c"
        venue = line[2:]

        ''' Parse paper id. Do not cast to integer. Simply unnecessary. '''
        line = input_file.readline().strip()
        assert line[:6] == "#index"
        id = line[6:]

        for a in authors:
            dictionary_add_set(author_papers, a, id)
            dictionary_add_set(author_venues, a, venue)
        dictionary_add_set(venue_papers, venue, id)
        paper_venue[id] = venue
        paper_authors[id] = authors

        ''' Parse citations. '''
        line = input_file.readline().strip()
        while line[:2] == "#%":
            ''' Invalid/empty citation. '''
            if len(line) <= 2:
                break
            dictionary_add_set(paper_papers, id, line[2:])
            line = input_file.readline().strip()

        ''' Read the empty string line so the readline output is not confused
        with EOF. Sets the reading pointer to the next paper's title line. '''
        line = input_file.readline()
        if line[:2] == "#!":
            input_file.readline()

    return (paper_authors, paper_papers, paper_venue,
            author_papers, venue_papers, author_venues)
if __name__ == '__main__':
    # Get command-line args
    args_ = get_setup_args()

    # Download resources
    download(args_)

    # Import spacy language model
    nlp = spacy.blank("en")

    # Keep all the docs for TF-IDF initialization
    tfidf_docs = []

    # Preprocess dataset
    args_.train_file = url_to_data_path(args_.train_url)
    args_.dev_file = url_to_data_path(args_.dev_url)
    if args_.include_test_examples:
        args_.test_file = url_to_data_path(args_.test_url)
    glove_dir = url_to_data_path(args_.glove_url.replace('.zip', ''))
    glove_ext = '.txt' if glove_dir.endswith('d') else '.{}d.txt'.format(args_.glove_dim)
    args_.glove_file = os.path.join(glove_dir, os.path.basename(glove_dir) + glove_ext)
    pre_process(args_)

    from tfidf import TFIDF
    print(len(tfidf_docs))
    tfidf_scorer = TFIDF(tfidf_docs)
    tfidf_scorer.prepare_data()
    tfidf_scorer.save_to_pickle()
def __init__(self):
    self.documents = {}
    self.tfidf = TFIDF()
def main(args):
    # Load TF-IDF from pickle
    scorer = TFIDF([])
    scorer.get_from_pickle()

    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)['{}_record_file'.format(args.split)]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vocab_size=1376,
                  hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Evaluate
    log.info('Evaluating on {} split...'.format(args.split))
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)['{}_eval_file'.format(args.split)]
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    if args.use_tfidf:
        # Apply TF-IDF filtering to pred_dict: blank out answers whose
        # normalized additive IDF falls below the threshold.
        tf_idf_threshold = 2
        tf_idf_common_threshold = 1
        for key, value in pred_dict.items():
            if value != "":
                tf_idf_score = scorer.normalized_additive_idf_ignore_common_words(
                    value, threshold_frequency=tf_idf_common_threshold)
                if tf_idf_score < tf_idf_threshold:
                    pred_dict[key] = ''
                    # print("pred_dict: {}, pruned".format(tf_idf_score))
                else:
                    pass
                    # print("pred_dict: {}, kept".format(tf_idf_score))

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items())
        log.info('{} {}'.format(args.split.title(), results_str))

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info('Writing submission file to {}...'.format(sub_path))
    with open(sub_path, 'w') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    # load weights into the new model
    model.load_weights("model/model.h5")
    print("Loaded model from disk")
    sgd = SGD(lr=0.01)
    model.compile(loss='binary_crossentropy', optimizer=sgd)
    return getBinaryResult(model.predict_proba(np.array(x)))


preproses()
td = TFIDF([xdata, ydata])

# TRAINING
# train(td.getOnlyX(), ydata)

# RETRAINING
# retrain_model(td.getOnlyX(), ydata)

# TESTING
test = "ahok itu pemimpin yang beres memimpin"
print(test)
print(testFromTrained([td.transform(test)]))

test = "ahok itu pemimpin yang ga beres memimpin"
print(test)
print(testFromTrained([td.transform(test)]))
def calcTFIDF(self):
    t = TFIDF()
    self.tfidf = t.docHandler(self.inverted_index, self.unique_id)
def __init__(self):
    self.preparation = Preparation()
    self.tfidf = TFIDF()
# read scrap_workbook
scrap_workbook = read_scrap(args.scrap_file_name)

## ES6
ES6_sheet = scrap_workbook["蔚来ES6"]
review_container = ReviewContainer(ES6_sheet)
review_list = review_container.get_review_list()
doc_word_count_info_list = build_doc_word_count_info_list(review_list)

## build model data structure
term_container = TermContainer(doc_word_count_info_list)
inverted_file = InvertedFile(term_container, doc_word_count_info_list)

# build query
query_list = get_query_list(args.query_expand_workbook_path)
query_expand_impl = QueryExpandImpl(args.query_expand_workbook_path)
set_topk_for_query_list(query_list, args.topk)
apply_query_expand_to_query_list(query_list, query_expand_impl)

# search
tfidf_engine = TFIDF(review_container)
apply_query_search_to_query_list(query_list, inverted_file, tfidf_engine, review_container)

# output_workbook
workbook = Workbook()
update_workbook_for_query_list(query_list, review_container, workbook)
workbook.remove(workbook['Sheet'])
workbook.save(args.output_path)
def __init__(self, queryString=""):
    print("Constructing Query Object!")
    self.invIndex = InvertedIndex()
    self.tfidf = TFIDF()
    self.query = queryString
def createTrainingSet(self):
    # initialize one-class SVM models, one per intent
    from sklearn import svm
    windowModel = svm.OneClassSVM(nu=0.01, kernel="linear")
    filterModel = svm.OneClassSVM(nu=0.01, kernel="linear")
    aggregateModel = svm.OneClassSVM(nu=0.01, kernel="linear")
    groupModel = svm.OneClassSVM(nu=0.01, kernel="linear")

    from tfidf import TFIDF
    tfidfInstance = TFIDF()

    documents = []
    fdoc = []
    adoc = []
    wdoc = []
    gdoc = []

    import json
    with open('intents.json') as json_data:
        intentsData = json.load(json_data)
    for intent in intentsData['intents']:
        for pattern in intent['pattern']:
            documents.append(pattern)
            if intent['tag'] == "filter":
                fdoc.append(pattern)
            if intent['tag'] == "window":
                wdoc.append(pattern)
            if intent['tag'] == "aggre":
                adoc.append(pattern)
            if intent['tag'] == "group":
                gdoc.append(pattern)

    texts = []
    # words specific to the stream do not help in intent detection and are
    # removed during preparation
    from FeatureExtractionWithTFIDF import TFIDFPreparer
    tfidfPreparer = TFIDFPreparer()
    for doc in documents:
        text = tfidfPreparer.prepareTextForTFIDF(doc)
        texts.append(text)

    self.countVectorizer, self.idf = tfidfInstance.getIDF(documents)
    self.tfidf_filter = tfidfInstance.getTFIDF(fdoc, self.countVectorizer, self.idf)
    self.tfidf_aggre = tfidfInstance.getTFIDF(adoc, self.countVectorizer, self.idf)
    self.tfidf_window = tfidfInstance.getTFIDF(wdoc, self.countVectorizer, self.idf)
    self.tfidf_group = tfidfInstance.getTFIDF(gdoc, self.countVectorizer, self.idf)

    # one scalar feature per pattern: the summed cosine similarity of the
    # pattern's TF-IDF vector to all patterns of the same intent
    x_filter = []
    for i in range(len(fdoc)):
        total = tfidfPreparer.getSumOfCosineSimilarity(self.tfidf_filter[i], self.tfidf_filter)
        x_filter.append([total])
    x_aggre = []
    for i in range(len(adoc)):
        total = tfidfPreparer.getSumOfCosineSimilarity(self.tfidf_aggre[i], self.tfidf_aggre)
        x_aggre.append([total])
    x_window = []
    for i in range(len(wdoc)):
        total = tfidfPreparer.getSumOfCosineSimilarity(self.tfidf_window[i], self.tfidf_window)
        x_window.append([total])
    x_group = []
    for i in range(len(gdoc)):
        total = tfidfPreparer.getSumOfCosineSimilarity(self.tfidf_group[i], self.tfidf_group)
        x_group.append([total])

    filterModel.fit(x_filter)
    windowModel.fit(x_window)
    aggregateModel.fit(x_aggre)
    groupModel.fit(x_group)

    import pickle
    pickle.dump(windowModel, open('finalized_windowModel.sav', 'wb'))
    pickle.dump(filterModel, open('finalized_filterModel.sav', 'wb'))
    pickle.dump(aggregateModel, open('finalized_aggregateModel.sav', 'wb'))
    pickle.dump(groupModel, open('finalized_groupModel.sav', 'wb'))
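# Hypothetical prediction-time use of one trained model (the names and the
# feature construction are assumptions that mirror the training code above,
# not part of the source):
# query_tfidf = tfidfInstance.getTFIDF([query], self.countVectorizer, self.idf)
# score = tfidfPreparer.getSumOfCosineSimilarity(query_tfidf[0], self.tfidf_filter)
# filterModel.predict([[score]])  # sklearn's OneClassSVM returns +1 (inlier) or -1 (outlier)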
print(wikipedia[3][0])
# return wikipedia

import copy

# def get_corpus():
corpus = [news.text]
titles = [soup.title]
print(len(corpus))
for article in wikipedia:
    # if article[0] in topics:
    corpus.append(article[1])
    titles.append(article[0])
print(len(corpus))
# return corpus, titles

from tfidf import TFIDF

# def get_sim_docs():
tfi = TFIDF()
tfidf = tfi.get_tfidf(corpus)
sim_docs = []
for index, score in tfi.similar_docs(tfidf, 0, 5):
    sim_docs.append((index, score))
    print(score, titles[index])
print("Most relevant document is " + titles[sim_docs[0][0]])
# return sim_docs