# __author__ = 'basil.beirouti'

import csv
import datetime
import random

from BM25 import last_thousand
from BM25.Scheduling import (
    whos_on,
    read_filtered_csv_file,
    read_raw_schedule_csv,
    write_filtered_csv_file,
    this_year,
    docmatrix_data,
)
from BM25.Plugins import tuples_tse_psums_concat
from BM25.TextCleaning import wordslist2string, cleanStringAndLemmatize
from BM25.BM25Okapi import QueryMaster, DocMatrix


def rand_divide(data, proportion):
    """Randomly split *data* into two disjoint groups.

    The first group gets ``round(proportion * len(data))`` items, the
    second gets the remainder.  Fixes two defects in the original:
    the input was shuffled in place (mutating the caller's list) and
    ``random.shuffle`` was redundantly called twice.

    Parameters
    ----------
    data : sequence
        Items to divide; left unmodified.
    proportion : float
        Fraction (0..1) of the items that go into the first group.

    Returns
    -------
    (list, list)
        Two disjoint lists whose concatenation contains every input item.
    """
    shuffled = list(data)  # copy first: do not mutate the caller's sequence
    random.shuffle(shuffled)
    split = round(proportion * len(shuffled))
    return shuffled[:split], shuffled[split:]


# --- Script: build a BM25 document matrix from the problem summaries of
# --- the TSEs who are currently on shift.
rows = read_filtered_csv_file()
on_now = whos_on(rows)
personids = [el[1] for el in on_now]

# Per the unpacking below, docmatrix_data yields rows shaped as
# (srnum, badgenum, personid, first_name, last_name, psum, date).
out = docmatrix_data(personids, 500)
srnums, badgenums, personids, fns, lns, psums, dates = zip(*out)

# Pair "first name + last name" with each problem summary, then clean
# and lemmatize the summary text.
tupsdata = [(el[3] + el[4], el[5]) for el in out]
cleaned_data = [
    (name, wordslist2string(cleanStringAndLemmatize(psum)))
    for name, psum in tupsdata
]

train, test = rand_divide(cleaned_data, 0.75)
processed_data = tuples_tse_psums_concat(train)

# NOTE(review): the original called DocMatrix(processed_data, ) and
# discarded the result; bind it so downstream code (e.g. a QueryMaster)
# can actually use it.
docmatrix = DocMatrix(processed_data)
# NOTE(review): the fragment below appears to be a second, older version of
# this script pasted in whole.  It begins mid-function — a duplicated tail of
# rand_divide, including a bare `return` with no visible enclosing `def` — and
# then references several names that are defined nowhere in this file as seen
# here: data_vmax, data_vnx, tse_psums_concat, time, and (commented out)
# Bm25Eval.  Presumably these came from the earlier file revision; confirm
# against version control before attempting to run or reformat this section.
# testfunction at the end builds a DocMatrix/QueryMaster pair and times two
# evaluatealgorithm calls, but reads the module-level name `test` and the
# unimported `time` module — TODO confirm intent and either restore the
# missing definitions or delete this duplicate block.
random.shuffle(data) random.shuffle(data) group1 = data[0:numgroup1] group2 = data[numgroup1:] assert(numgroup1 == len(group1)) assert(numgroup2 == len(group2)) return group1, group2 train1, test1 = rand_divide(data_vmax, 0.75) train2, test2 = rand_divide(data_vnx, 0.75) train = train1 + train2 test = test1 + test2 print("divided data into training and testing sets") tups_train = tuples_tse_psums_concat(train) tups_train2 = tse_psums_concat(train) print("grouped problem summaries by TSE") # evaluator = Bm25Eval(tups_train, test) # print("running evaluation") # evaluator.evaluatealgorithm() def testfunction(tups_train): okapi_docmatrix = DocMatrix(tups_train, bm25 = True, ngrams_range = (1,1)) query_master = QueryMaster(okapi_docmatrix) start = time.time() query_master.evaluatealgorithm(test, 1) query_master.evaluatealgorithm(test, 10) stop = time.time()