Code Example #1
import os

import pandas as pd

# (box, processed_file, cache_space, and allboxfiles come from earlier in
# the full script; this excerpt starts mid-pipeline)
# compare allboxfiles to the list of already-processed files in the store
# -- not done yet...
######################################
processed = pd.read_csv(processed_file)
processed = processed[['file_id', 'raw_processed_date']].copy()

files4process = pd.merge(allboxfiles, processed, on='file_id', how='left')
files4process = files4process.loc[
    files4process.raw_processed_date.isnull()].copy()
files4process.drop(columns=['raw_processed_date'], inplace=True)
# files4process = allboxfiles  # first time around, grab everything
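
# Sketch (hypothetical helper, not in the original script): the null-check
# above is a left-join anti-join; pd.merge(indicator=True) expresses the
# same thing without relying on raw_processed_date being non-null.
def antijoin_on_file_id(left, right):
    """Return rows of `left` whose file_id does not appear in `right`."""
    merged = pd.merge(left, right[['file_id']], on='file_id',
                      how='left', indicator=True)
    return merged.loc[merged['_merge'] == 'left_only'].drop(
        columns=['_merge']).copy()
# e.g. files4process = antijoin_on_file_id(allboxfiles, processed)
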
# download everything not already processed to the cache -- originally
# len(allboxfiles.file_id) should be 3506 as of 5/22/19, minus 2 files with
# duplicate filenames; updates will have far fewer
# (only a handful of really young kids ran this battery)
box.download_files(files4process.file_id)
# one file violated the naming convention and broke the row-reading
# functions... figure out how to be more flexible

# need to convert the Windows text files to Unix (the extra iconv step is
# needed because the files have no BOM)
for file in files4process.filename:
    myCmd = 'iconv -f UTF-16 -t UTF-8 ' + cache_space + '/' + \
        file + ' | dos2unix > ' + cache_space + '/Utf8_unix_' + file
    print('Running system command: ' + myCmd)
    os.system(myCmd)
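
# Sketch (assumption, not part of the original pipeline): the same
# UTF-16 -> UTF-8 / CRLF -> LF conversion can be done in pure Python
# instead of shelling out to iconv and dos2unix.
def convert_utf16_to_utf8_unix(name):
    """Re-encode one cached file as UTF-8 with Unix (LF) line endings."""
    src = os.path.join(cache_space, name)
    dst = os.path.join(cache_space, 'Utf8_unix_' + name)
    # universal newlines turn \r\n into \n on read; newline='\n' keeps LF
    # on write (use 'utf-16-le' explicitly if these exports lack a BOM)
    with open(src, encoding='utf-16') as fin, \
            open(dst, 'w', encoding='utf-8', newline='\n') as fout:
        fout.write(fin.read())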

# now read each file into a row (create a row from each txt file)
rows = pd.DataFrame()
for file in files4process.filename:
    test = os.popen('tail -20 ' + cache_space + '/Utf8_unix_' + file).read()
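
# Sketch (hypothetical helper): a pure-Python stand-in for the
# `tail -20` shell call above.
def tail_lines(name, n=20):
    """Return the last n lines of a converted cache file as one string."""
    path = os.path.join(cache_space, 'Utf8_unix_' + name)
    with open(path, encoding='utf-8') as fin:
        return ''.join(fin.readlines()[-n:])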
Code Example #2
import os

import pandas as pd

# (box, box_temp, foldercontents, folderlistcontents, catcontents, and
# findwierdos come from earlier in the full script)
# start with data that is the result of an extensive QC effort from the sites.
# keep track of expected and observed IDs.
# curate a list of TBX issues.
# pull in data (by ID) that is not on the list of issues.


# get list of filenames
##########################
Harvard = 84800505740
harvardfiles, harvardfolders = foldercontents(Harvard)
harvardfiles2, harvardfolders2 = folderlistcontents(
    harvardfolders.foldername, harvardfolders.folder_id)
harvardfiles = pd.concat([harvardfiles, harvardfiles2], axis=0, sort=True)
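
# Sketch (assumption): foldercontents/folderlistcontents are local helpers
# not shown in this excerpt; with the official boxsdk Client, a minimal
# foldercontents might look roughly like this.
def foldercontents_sketch(client, folder_id):
    """Return (files, folders) DataFrames for one Box folder."""
    files, folders = [], []
    for item in client.folder(str(folder_id)).get_items():
        if item.type == 'file':
            files.append({'filename': item.name, 'file_id': item.id})
        elif item.type == 'folder':
            folders.append({'foldername': item.name, 'folder_id': item.id})
    return pd.DataFrame(files), pd.DataFrame(folders)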

data4process = harvardfiles.loc[
    harvardfiles.filename.str.contains('Data', na=False)]
scores4process = harvardfiles.loc[
    harvardfiles.filename.str.contains('Score', na=False)]
box.download_files(data4process.file_id)
box.download_files(scores4process.file_id)

hdatainit = catcontents(data4process, box_temp)
hscoreinit = catcontents(scores4process, box_temp)
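
# Sketch (assumption): catcontents is a local helper not shown here; it
# appears to read each downloaded file from box_temp and stack them into
# one DataFrame, roughly:
def catcontents_sketch(files, folder):
    """Concatenate the downloaded CSVs listed in `files` into one frame."""
    frames = []
    for name in files.filename:
        df = pd.read_csv(os.path.join(folder, name))
        df['filename'] = name  # keep provenance for later debugging
        frames.append(df)
    return pd.concat(frames, axis=0, sort=True)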

# look for duplications; dlist and slist come back empty if there are none
dlist, slist = findwierdos(hdatainit, hscoreinit)

# if nothing weird was found, just drop any identical duplicates
if dlist.empty and slist.empty:
    hdatainit = hdatainit.drop_duplicates(
        subset=['PIN', 'Inst', 'ItemID', 'Position'], keep='first')
    hscoreinit = hscoreinit.drop_duplicates(subset=['PIN', 'Inst'])
else:
    print('Found Non-Identical Duplications')
    print(dlist)
    print(slist)
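
# Sketch (assumption): findwierdos is a local helper not shown here; a
# plausible reading is that it flags key collisions that are NOT exact row
# duplicates (exact duplicates are the safe-to-drop case handled above).
def find_nonidentical_dupes(df, keys):
    """Rows whose key repeats even after exact duplicates are removed."""
    deduped = df.drop_duplicates()  # exact copies collapse to one row
    return deduped[deduped.duplicated(subset=keys, keep=False)]
# e.g. find_nonidentical_dupes(hdatainit, ['PIN', 'Inst', 'ItemID', 'Position'])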