# Compare allboxfiles to the list in the store of already processed files -- not done yet.
######################################
processed = pd.read_csv(processed_file)
processed = processed[['file_id', 'raw_processed_date']].copy()
# Keep only Box files with no recorded processing date.
files4process = pd.merge(allboxfiles, processed, on='file_id', how='left')
files4process = files4process.loc[
    files4process.raw_processed_date.isnull()].copy()
files4process.drop(columns=['raw_processed_date'], inplace=True)
# files4process = allboxfiles  # first time around, grab everything

# Download everything not already processed to the cache. Originally there
# should be len(allboxfiles.file_id) = 3506 (as of 5/22/19) minus 2 with
# duplicate filenames; updates will have far fewer.
# Only a handful of really young kids ran this battery.
box.download_files(files4process.file_id)

# One file violated the naming convention and broke the read-row
# functions; figure out how to be more flexible.
# Convert the Windows text files to Unix (the extra iconv step is needed
# because the files are UTF-16 with no BOM).
for file in files4process.filename:
    myCmd = ('iconv -f UTF-16 -t UTF-8 ' + cache_space + '/' + file +
             ' | dos2unix > ' + cache_space + '/Utf8_unix_' + file)
    print('Running system command: ' + myCmd)
    os.system(myCmd)

# Now read each file into a row (create one row per txt file).
rows = pd.DataFrame()
for file in files4process.filename:
    # Read the last 20 lines of the converted file.
    test = os.popen('tail -20 ' + cache_space + '/Utf8_unix_' + file).read()
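# The loop above is truncated at this point in the section. A minimal
# sketch of how the tail output might be turned into a one-row DataFrame,
# assuming the files end with tab-separated 'key<TAB>value' lines; the
# helper name and file format are assumptions, not the original code.
def parse_tail_to_row(tail_text, filename):
    fields = {'filename': filename}
    for line in tail_text.splitlines():
        parts = line.split('\t')
        if len(parts) == 2:
            fields[parts[0].strip()] = parts[1].strip()
    return pd.DataFrame([fields])

# Example usage inside the loop above:
#     rows = pd.concat([rows, parse_tail_to_row(test, file)],
#                      ignore_index=True)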
# Start with data that is the result of the extensive QC effort from sites.
# Keep track of expected and observed IDs.
# Curate a list of TBX issues.
# Pull in data (by ID) that is not on the list of issues.
# Get the list of filenames.
##########################
Harvard = 84800505740
harvardfiles, harvardfolders = foldercontents(Harvard)
harvardfiles2, harvardfolders2 = folderlistcontents(
    harvardfolders.foldername, harvardfolders.folder_id)
harvardfiles = pd.concat([harvardfiles, harvardfiles2], axis=0, sort=True)

# Split into raw data files and score files, then download both sets.
data4process = harvardfiles.loc[
    harvardfiles.filename.str.contains('Data', na=False)]
scores4process = harvardfiles.loc[
    harvardfiles.filename.str.contains('Score', na=False)]
box.download_files(data4process.file_id)
box.download_files(scores4process.file_id)

# Concatenate the downloaded files into single data and score frames.
hdatainit = catcontents(data4process, box_temp)
hscoreinit = catcontents(scores4process, box_temp)

# Check for non-identical duplicates; if there are none, just drop any
# identical duplicates.
dlist, slist = findwierdos(hdatainit, hscoreinit)
if dlist.empty and slist.empty:
    hdatainit = hdatainit.drop_duplicates(
        subset=['PIN', 'Inst', 'ItemID', 'Position'], keep='first')
    hscoreinit = hscoreinit.drop_duplicates(subset=['PIN', 'Inst'])
else:
    print('Found Non-Identical Duplications')
    print(dlist)
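# findwierdos is defined elsewhere in this repo. A minimal sketch of the
# idea, assuming it flags rows that share key columns but differ elsewhere
# (i.e. non-identical duplicates); the key columns mirror the
# drop_duplicates calls above. This is an illustration, not the original.
def find_nonidentical_dupes(df, keys):
    # Rows that share their key columns with at least one other row...
    dup_keys = df[df.duplicated(subset=keys, keep=False)]
    # ...minus rows that are duplicated across ALL columns, which are
    # identical copies and safe to drop with drop_duplicates.
    return dup_keys[~dup_keys.duplicated(keep=False)]

# e.g. find_nonidentical_dupes(hdatainit,
#                              ['PIN', 'Inst', 'ItemID', 'Position'])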