def test_from_pickle():
    """Round-trip check: modin's read_pickle agrees with pandas' on one file.

    Creates a small pickle fixture, loads it through both libraries,
    asserts frame equality, and removes the fixture again.
    """
    setup_pickle_file(SMALL_ROW_SIZE)
    frame_via_pandas = pandas.read_pickle(TEST_PICKLE_FILENAME)
    frame_via_modin = pd.read_pickle(TEST_PICKLE_FILENAME)
    assert modin_df_equals_pandas(frame_via_modin, frame_via_pandas)
    teardown_pickle_file()
# NOTE(review): fragment — everything up to and including the `return` is the
# tail of a recommender function whose `def` line is outside this view
# (it receives `context_list`; presumably lda_recommend — TODO confirm).
# The statements after the triple-quoted block run at module level.
# Original formatting was collapsed onto one line; reconstructed here.
""" With multiprocessing using Dask"""
print("Recommending")
topn = 500  # number of document ids to return
sleep(0.2)
# Bag-of-words representation of the query context.
vec_bow = id2word_dictionary.doc2bow(context_list)
# This line takes a LONG time: it has to map to each of the 300 topics
vec_ldamallet = ldamallet[vec_bow]  # Convert the query to LDA space
# Similarity of the query against every indexed document.
sims = malletindex[vec_ldamallet]
sims = sorted(enumerate(sims), key=lambda item: -item[1])[:topn]
# sims is a list of tuples of (docid -- line num in original training file, probability)
return [docid_to_magid.get(docid) for docid, prob in sims]

#df = pd.read_pickle('recommendationsids.pickle')
# Disabled pipeline kept for provenance: loads the LDA/Mallet models and
# regenerates the recommendation dataframe from scratch.
'''df = pd.read_pickle('/home/ashwath/Programs/ACLAAn/Pickles/recommendations_aclmag_3models_500_df.pickle')
malletindex = similarities.MatrixSimilarity.load('/home/ashwath/Programs/ACLAAn/LDA/simIndexAcl.index')
with open('/home/ashwath/Programs/ACLAAn/LDA/docid_to_magid_training_acl.pickle', 'rb') as pick:
    docid_to_magid = pickle.load(pick)
id2word_dictionary = corpora.Dictionary.load('/home/ashwath/Programs/ACLAAn/LDA/aclmag.dict')
corpus = corpora.MmCorpus('/home/ashwath/Programs/ACLAAn/LDA/aclmag_bow_corpus.mm')
ldamallet = LdaMallet.load('/home/ashwath/Programs/ACLAAn/LDA/lda_model.model')
df['lda_recommendations'] = df['context_for_lda'].progress_apply(lda_recommend)
df['lda_binary'] = df[['ground_truth', 'lda_recommendations']].apply(
    lambda x: binarize_predictions(x.ground_truth, x.lda_recommendations), axis=1)
df.to_pickle('/home/ashwath/Programs/ACLAAn/Pickles/malletrecommendations_aclmag_500_df.pickle')
df.to_csv('/home/ashwath/Programs/ACLAAn/Evaluation/malletrecommendations_aclmag_500.tsv', sep='\t')'''
sleep(0.3)
# Live path: evaluate the precomputed Mallet recommendations.
df = pd.read_pickle('/home/ashwath/Programs/ACLAAn/Pickles/malletrecommendations_aclmag_500_df.pickle')
calculate_metrics(df[['lda_recommendations', 'lda_binary', 'ground_truth']])
# --- Data acquisition ---------------------------------------------------
if get_data:
    # Scrape fresh listings. Positional arguments follow listing_results:
    # (propertyTypes, minBedrooms, minBathrooms, minCarspaces, minPrice,
    #  maxPrice, minLandArea, state, region, area, suburb,
    #  includeSurroundingSuburbs)
    # dont use zeros here yet needs to be fixed
    df = domain().listing_results(
        ["House"], 2, 1, 1, 500000, 550000, 10, "VIC", "", "", "", False)
    print("Main frame shape: ", df.shape)
    # Persist the raw pull both as TSV (inspectable) and pickle (fast reload).
    df.to_csv('data/{version}_data.csv'.format(version=version),
              sep='\t', encoding='utf-8')
    df.to_pickle('data/{version}_data.pkl'.format(version=version))
    # Duplicate-id diagnostics, currently disabled:
    #dup_df_2 = df[df['id'].duplicated() == True]
    #dup_df_2 = dup_df_2.sort_values(by=['id'])
    #r, c = dup_df_2.shape
    #if r > 0:
    #    print("duplicates in df")
else:
    # Reuse a previously scraped snapshot instead of hitting the API.
    df = pd.read_pickle(static_pkl)

# --- Feature extraction -------------------------------------------------
# (redundant `else: pass` branch removed)
if compute_features:
    features = processing.compute_features(df)
    print(features)
    # Known feature columns, kept for reference (was a no-op string literal):
    # ['BuiltInWardrobes', 'SecureParking', 'AirConditioning', 'Ensuite',
    #  'Gas', 'Heating', 'Dishwasher', 'BalconyDeck', 'InternalLaundry',
    #  'PetsAllowed', 'Bath', 'Study', 'FullyFenced', 'Floorboards',
    #  'BroadbandInternetAccess', 'GardenCourtyard', 'AlarmSystem', 'Shed',
    #  'Gym', 'Intercom', 'SolarPanels', 'WaterViews', 'Furnished',
    #  'NorthFacing', 'SwimmingPool', 'RainwaterStorageTank',
    #  'CableOrSatellite', 'GroundFloor', 'SolarHotWater', 'TennisCourt',
    #  'OutdoorSpa', 'DoubleGlazedWindows', 'WallCeilingInsulation',
    #  'SeparateDiningRoom', 'IndoorSpa']

# --- Scoring ------------------------------------------------------------
df = processing.feature_score(df, feature_ranking)
print(df)
df.to_csv('data/{version}_data_featurescored.csv'.format(version=version),
          sep='\t', encoding='utf-8')
def predict(filename):
    """Get the recommendations using the 3 methods and put them in a dataframe.

    Parameters
    ----------
    filename : str
        TSV of (ground_truth, citing_acl_id, context). Only consumed by the
        disabled generation phase below; the live code loads precomputed
        recommendations from 'recommendationsids.pickle' instead.

    Side effects: writes the binarized results to a pickle and a TSV, then
    hands the relevant columns to calculate_metrics.
    """
    # Phase 1 (disabled): build contexts and compute hd2v / bm25 / lda
    # recommendations from scratch. Kept verbatim for provenance.
    '''
    df = pd.read_csv(filename, sep='\t', names=['ground_truth', 'citing_acl_id', 'context'])
    print("Read file")
    #df = df.head()
    # Convert cited mag ids to a list
    df['ground_truth'] = df['ground_truth'].astype('str').apply(lambda x: x.split(','))
    df['context_for_lda'] = df['context'].apply(lda_preprocessing)
    print("Created lda contexts")
    sleep(0.3)
    # clean_text is present in hyperdoc2vec
    df['context'] = df['context'].apply(clean_text)
    print('Cleaned contexts')
    sleep(0.3)
    df['wordcount'] = df['context'].apply(lambda x: len(x.split()))
    # Remove contexts with less than 8 words
    df = df[df.wordcount>8]
    df['hd2v_recommendations'] = df['context'].apply(hd2v_recommend)
    print('hd2v recommendations done')
    sleep(0.3)
    df['bm25_recommendations'] = df['context'].apply(solr_recommend)
    sleep(0.3)
    print('solr recommendations done')
    # LDA Recommendations take a long time, parallelize
    #ddask = dd.from_pandas(df, npartitions=64)
    #ddask['lda_recommendations'] = ddask['context_for_lda'].apply(lda_recommend)
    df['lda_recommendations'] = df['context_for_lda'].apply(lda_recommend)
    print('lda recommendations done')
    '''
    # Phase 2 (live): load precomputed recommendations and binarize each
    # model's list against the ground truth.
    df = pd.read_pickle('recommendationsids.pickle')
    print('read pickle')
    #df.to_pickle('recommendationsids.pickle')
    #sleep(0.3)

    def _binarize(model):
        # One column per model; delegates the actual comparison to
        # binarize_predictions (defined elsewhere in this project).
        rec_col = '{}_recommendations'.format(model)
        return df[['ground_truth', rec_col]].apply(
            lambda x: binarize_predictions(x.ground_truth, x[rec_col]),
            axis=1)

    df['hd2v_binary'] = _binarize('hd2v')
    print(df.hd2v_binary)
    sleep(0.3)
    df['lda_binary'] = _binarize('lda')
    sleep(0.3)
    df['bm25_binary'] = _binarize('bm25')
    # NOTE: for 500 recommendations, ground truth is present in hd2v's
    # recommendations 909 times, lda 1146 times, bm25 1543 times.
    # (total: 2819) -- 32.24%, 40.65%, 54.74%
    # t = df.bm25_binary.apply(lambda x: 1 in x or 2 in x)
    print("Binarized")
    df.to_pickle(
        '/home/ashwath/Programs/ACLAAn/Pickles/recommendations_aclmag_3models_500_df.pickle'
    )
    df.to_csv(
        '/home/ashwath/Programs/ACLAAn/Evaluation/recommendations_aclmag_3models_500.tsv',
        sep='\t')
    print("Prediction done")
    calculate_metrics(df[[
        'hd2v_recommendations', 'lda_recommendations', 'bm25_recommendations',
        'hd2v_binary', 'lda_binary', 'bm25_binary', 'ground_truth'
    ]])
# NOTE(review): fragment — the leading path string and ')' close a
# `df.to_pickle(` call whose opening parenthesis is outside this view.
# Original formatting was collapsed onto one line; reconstructed here.
    '/home/ashwath/Programs/MAGCS/Pickles/paperwisemetrics_magLDA_3models_df.pickle'
)
print("METRICS CALCULATED, time to calculate the means")
# Get the mean of all the index columns
# First, drop list columns.
df = df.drop(['lda_recommendations', 'lda_binary', 'ground_truth'], axis=1)
mean_series = df.mean()
mean_series.to_csv(
    '/home/ashwath/Programs/MAGCS/Evaluation/meanmetrics_mag_lda.tsv',
    sep='\t', index=True, header=False)
print("C'est fini.")

# Second pass: recompute LDA recommendations on the hyperdoc2vec dataframe
# and evaluate them. progress_apply presumably requires tqdm.pandas() to
# have been called earlier — confirm against the rest of the file.
df = pd.read_pickle(
    '/home/ashwath/Programs/MAGCS/MAG-hyperdoc2vec/recommendationsids.pickle')
df['lda_recommendations'] = df['context_for_lda'].progress_apply(lda_recommend)
print('lda recommendations done')
df['lda_binary'] = df[['ground_truth', 'lda_recommendations']].apply(
    lambda x: binarize_predictions(x.ground_truth, x.lda_recommendations),
    axis=1)
sleep(0.3)
print("Binarized")
df.to_pickle(
    '/home/ashwath/Programs/MAGCS/Pickles/ldarecommendations_mag_3models_500_df.pickle'
)
calculate_metrics(df[['lda_recommendations', 'lda_binary', 'ground_truth']])