def initial_load_jhu_files(session):
    """Load every JHU daily file that has not been processed yet.

    Skips files whose date is at or before the last processed date,
    unless --load_missing_files was given and that date has no data
    in the database.

    :param session: The SQLAlchemy session.
    """
    files = get_files(JHU_DATA_DIRECTORY)

    # Find the newest date already in the database so we never re-load it.
    # An empty LastDate table means nothing has been processed at all.
    try:
        last_ordinal_date_processed = session.query(LastDate).one().ordinal_date
    except NoResultFound:
        last_ordinal_date_processed = 0

    for filename in files:
        ordinal_date = filename_to_ordinal_date(filename)

        # Skip already-processed dates, except when we are explicitly
        # back-filling dates that are missing from the database.
        already_done = ordinal_date <= last_ordinal_date_processed
        if already_done and not (
            args.load_missing_files and date_is_missing_data(session, ordinal_date)
        ):
            continue

        # str(datetime.now())[:-7] drops the microseconds for readability.
        print(
            "Processing",
            ordinal_date_to_string(ordinal_date),
            str(datetime.now())[:-7],
        )
        process_one_jhu_file(session, filename, ordinal_date)
        print(str(datetime.now())[:-7])
def evaluate(out_file):
    """Compare corpus files against their corrected counterparts.

    Collects Thai/English file lists from both folders, keeps only the
    files present in both, prefixes each name with its folder path, and
    hands everything to calc_with_text for scoring.

    :param out_file: Destination passed through to calc_with_text.
    """
    corpus_dir = "corpus_files"
    correct_dir = "correct_files"

    th1, en1, _ = fhand.get_files(corpus_dir)
    th2, en2, _ = fhand.get_files(correct_dir)

    # Restrict all four lists to the files that exist in both folders.
    th1, en1, th2, en2 = check_in_both(th1, en1, th2, en2)

    # Turn bare names into Windows-style relative paths.
    th1 = [corpus_dir + "\\" + name for name in th1]
    en1 = [corpus_dir + "\\" + name for name in en1]
    th2 = [correct_dir + "\\" + name for name in th2]
    en2 = [correct_dir + "\\" + name for name in en2]

    calc_with_text(en1, en2, th1, th2, out_file)
def test_get_files(filepaths):
    """Exercise fh.get_files with missing and then existing target files.

    :param filepaths: Fixture tuple; [0] is the salt file, [1:] are three
        data-file paths that do not exist when the test starts.
    """
    salt_f = filepaths[0]
    # A path that is guaranteed not to exist next to this test module.
    no_exist = Path(__file__).parent / 'test_not_exist.txt'
    paths = ((salt_f,),
             (no_exist, filepaths[1]),
             (no_exist, filepaths[2]),
             (no_exist, filepaths[3]))

    files = fh.get_files(paths)
    # Second element is the "all files exist" flag — must be falsy here.
    assert not files[1]
    # Every missing file falls back to the no_exist placeholder path.
    assert files[0][1:] == tuple(Path(no_exist) for _ in range(3))
    # The first entry is the generated salt (16 characters/bytes).
    assert len(files[0][0]) == 16

    # Now create the files and verify they are found.
    salt = fh.generate_salt(salt_f)
    for file in filepaths[1:]:
        # Context manager instead of bare open()/close(): the file is
        # reliably closed even if creation raises.
        with open(file, 'w'):
            pass

    files = fh.get_files(paths)
    assert files[1]
    assert files[0][0] == salt
    assert files[0][1:] == tuple(Path(file).resolve() for file in filepaths[1:])
def test_cogs():
    """Run calc_with_text over the raw corpus files only.

    Unlike evaluate(), this uses just the corpus folder — no comparison
    against the corrected files.
    """
    file_folder = "corpus_files"
    th1, en1, _ = fhand.get_files(file_folder)

    # Prefix bare names with the folder (Windows-style separators),
    # using comprehensions instead of index loops.
    th1 = [file_folder + "\\" + name for name in th1]
    en1 = [file_folder + "\\" + name for name in en1]

    calc_with_text(en1, th1)
def refresh_files(session):
    """
    JHU frequently changes a line or two in old files, to correct existing
    data. Check for files which have been modified since initially loaded,
    and update those.

    :param session: The SQLalchemy session
    :return: None
    :side_effect: The database MAY BE updated.
    """
    refreshed_count = 0
    for filename in get_files(JHU_DATA_DIRECTORY):
        if needs_refreshing(session, filename):
            refreshed_count += 1
            refresh_lines(session, filename)
            record_file_mtime(session, filename)
            # Commit per file so each refresh lands independently.
            session.commit()

    if refreshed_count == 0:
        print("All files were up to date.")
def connect_to_pm_dbs(default_is_ok: bool = True, force_connect: bool = False) -> Optional[PM]:
    """Build a PM instance from the configured database file paths.

    :param default_is_ok: Passed through to get_files when resolving paths.
    :param force_connect: Connect even if the files do not all exist.
    :return: A PM wired to the auth, keys and password DBs, or None when
        the files are missing and force_connect is False.
    """
    filepaths, exists = get_files(fl.paths, default_is_ok)

    # Guard clause: bail out early unless we have files or are forced.
    if not (exists or force_connect):
        return None

    return PM(
        filepaths[0],
        DB_auth(filepaths[1]),
        DB_keys(filepaths[2]),
        DB_password(filepaths[3]),
    )
def main():
    """Process the corpus: singles individually, Thai/English pairs together.

    BUG FIX: the original nested ``if __name__ == "__main__"`` *inside*
    main(), which silently turned main() into a no-op whenever the module
    was imported and main() was called. The guard belongs at module level
    (``if __name__ == "__main__": main()``), not inside the function.
    """
    # file_folder is a module-level setting — presumably the corpus
    # directory; confirm against the rest of the module.
    th, en, singles = fhand.get_files(file_folder)
    process_singles(singles)
    process_pairs(th, en)