import concurrent.futures
from tabulate import tabulate
from timeit import default_timer as timer

from dataUtils import DataUtils
from dataAggregator import DataAggregator
from baselines import MostPopularRecommender, MostRecentRecommender, MeanScoreRecommender
from evalMethods import evaluate_recall, evaluate_arhr, evaluate_mse
from contentbased import ContentBasedRecommender

if __name__ == '__main__':
    # Driver script: load the event log, filter out unusable rows, and report
    # how much data survived. (Recommender construction/evaluation presumably
    # follows in a later part of the file — not visible here.)
    DATANUM = 0  # number of files to load; 0 means load all
    K = 10       # top-K cutoff, presumably for recall@K — TODO confirm with evaluators

    dataHelper = DataUtils()

    print("loading data...")
    entries = dataHelper.load_data("active1000", num=DATANUM)
    numLoaded = len(entries)
    print(f"loaded {numLoaded} events.")

    # Filter out unhelpful rows
    print("filtering data...")
    filtered_data = dataHelper.filter_data(entries)
    numLost = numLoaded - len(filtered_data)
    if numLoaded:
        print(
            f"filtered to {len(filtered_data)} events. {numLost} events ({numLost / numLoaded:%}) discarded."
        )
    else:
        # Guard: with zero loaded events the percentage is undefined
        # (original code raised ZeroDivisionError here).
        print("filtered to 0 events. nothing was loaded, so nothing was discarded.")

    # Re-index document and user IDs to start at 0 and be sequential, manually
    #print("re-indexing data...")
""" @version: 1.0 @author: Jie Lin @Mail: [email protected] @file: dataUtils.py @time: 09/27/2018 4:08pm @purpose: this files contain a main method to call each function in class DataUtils in order to clean data @code environment: ubuntu 18.01 """ from dataUtils import DataUtils # the main method to call each function in Datautils class if __name__ == '__main__': # to call a DataUtils class dataUtil = DataUtils() # pass data address to class to get a dataframe messyData = dataUtil.readFiles('./SIIM2016_Messy_Fake_EMRdata.csv') print(messyData) #print data under resource column and show without duplicate with the number of each kind of data print("showing data under resource:") print(dataUtil.printCDataWoutDu(messyData, "resource")) print() #select TRCT1 under resource column print("showing data with resource name = \'TRCT1\'") resourceTRCT1 = dataUtil.findDataUnCol(messyData, "TRCT1", "resource") print(resourceTRCT1) print()