def _read_lastfm_csv(folder, filename):
    """Read ``filename`` from ``folder`` into a pandas DataFrame.

    Raises:
        FileNotFoundError: if the file does not exist; the message names the
            folder and the file that was expected inside it.
    """
    try:
        return pd.read_csv(os.path.join(folder, filename))
    except FileNotFoundError as err:
        raise FileNotFoundError(
            'Please make sure {} contains a file "{}".'.format(folder, filename)
        ) from err


def clean_database():
    """Build the cleaned tags / tids / tid_tag tables and wrap them.

    Reads its configuration from the module-level ``args`` object:

    * ``args.supp_txt_path`` -- if truthy, forwarded to
      ``lastfm_utils.set_txt_path``.
    * ``args.input`` -- either a directory holding ``lastfm_tags.csv``,
      ``lastfm_tids.csv`` and ``lastfm_tid_tag.csv`` (loaded through
      ``LastFm2Pandas.load_from``), or anything else, which is handed to
      ``LastFm`` (presumably the path of a .db file -- see original comment).
    * ``args.val`` -- forwarded to ``create_tid_tag_table``.

    Returns:
        The object produced by ``LastFm2Pandas.load_from`` for the new
        ``tags``, ``tids`` and ``tid_tag`` tables.

    Raises:
        FileNotFoundError: if ``args.input`` is a directory and one of the
            three expected .csv files is missing.
        AssertionError: if the dataframe produced by
            ``lastfm_utils.generate_final_df`` does not have exactly the
            columns ``['tag', 'merge_tags']``.
    """
    # check if different .txt path has been provided
    if args.supp_txt_path:
        lastfm_utils.set_txt_path(args.supp_txt_path)

    # check if user provided a .csv folder or a .db file (if using .csv, load
    # .csv into LastFm2Pandas; otherwise, load .db into LastFm)
    if os.path.isdir(args.input):
        tags = _read_lastfm_csv(args.input, 'lastfm_tags.csv')
        tids = _read_lastfm_csv(args.input, 'lastfm_tids.csv')
        tid_tag = _read_lastfm_csv(args.input, 'lastfm_tid_tag.csv')
        lastfm = LastFm2Pandas.load_from(tags=tags, tids=tids, tid_tag=tid_tag)
    else:
        lastfm = LastFm(args.input)

    df = lastfm_utils.generate_final_df(lastfm)
    df.reset_index(drop=True, inplace=True)  # sanity check
    df.index += 1  # shift the fresh 0-based index so that it starts at 1

    # sanity check; compare as plain lists so that a wrong *number* of columns
    # fails the assertion instead of raising pandas' length-mismatch ValueError
    assert list(df.columns) == ['tag', 'merge_tags']

    # generate tables which will go into output database
    tags = df['tag'].str.lower()

    print('Matching all tags to the "clean" few ones...', end=' ', flush=True)
    tag_tag = create_tag_tag_table(lastfm, df)
    print('done')

    print('Matching all tids to tags...', end=' ', flush=True)
    tid_tag = create_tid_tag_table(lastfm, tag_tag, args.val)
    print('done')

    print('Purging tids...', end=' ', flush=True)
    tids = tid_tag['tid'].drop_duplicates()
    # index each surviving tid by its own value so the reindex below can align
    tids.index = tids.values
    # reindex over 1..len(get_tid_nums()): labels with no surviving tid become
    # NaN (assumes the original tid nums are exactly 1..N -- TODO confirm)
    tids = tids.map(lastfm.tid_num_to_tid).reindex(
        pd.RangeIndex(1, len(lastfm.get_tid_nums()) + 1)
    )
    print('done')

    # wrap into LastFm2Pandas class. NOTE: 'lastfm' is a local variable bound
    # to the loaded database object, so the class must be referenced directly
    # (as above) rather than through 'lastfm.LastFm2Pandas'.
    return LastFm2Pandas.load_from(tags=tags, tids=tids, tid_tag=tid_tag)