Ejemplo n.º 1
0
def initial_load_jhu_files(session):
    """
    Process every JHU data file that has not been loaded yet.

    A file whose date is on or before the last processed date is skipped,
    unless ``args.load_missing_files`` is set and ``date_is_missing_data``
    reports that the date still lacks data.  A timestamp is printed before
    each processed file and once more when the loop finishes.
    :param session: The database session (queried via ``session.query``)
    :return: None
    """
    # Get the files to process
    files = get_files(JHU_DATA_DIRECTORY)

    # Get the last date already processed.  Don't want to re-do any data.
    try:
        last_ordinal_date_processed = session.query(LastDate).one().ordinal_date
    except NoResultFound:
        # Nothing in the DB so far means we haven't processed any files.
        last_ordinal_date_processed = 0

    # For each file get the associated date.
    for filename in files:
        ordinal_date = filename_to_ordinal_date(filename)
        # Have we already processed this one?
        if ordinal_date <= last_ordinal_date_processed:
            if not args.load_missing_files:
                continue
            if not date_is_missing_data(session, ordinal_date):
                continue

        # Zero the microseconds instead of slicing the string: str() omits
        # the fractional part when microsecond == 0, so a fixed [:-7] slice
        # would then cut into the seconds and minutes.
        print("Processing", ordinal_date_to_string(ordinal_date),
              str(datetime.now().replace(microsecond=0)))
        # clean_partial_load(ordinal_date)
        process_one_jhu_file(session,
                             filename,
                             ordinal_date)
    print(str(datetime.now().replace(microsecond=0)))
Ejemplo n.º 2
0
def evaluate(out_file):
    """
    Compare the files in "corpus_files" with those in "correct_files".

    Both folders are listed with ``fhand.get_files``, the listings are
    passed through ``check_in_both``, every file name is prefixed with its
    folder, and the resulting lists are handed to ``calc_with_text``.
    :param out_file: Forwarded unchanged to ``calc_with_text``
    :return: None
    """
    corpus_dir = "corpus_files"
    reference_dir = "correct_files"

    th1, en1, _ = fhand.get_files(corpus_dir)
    th2, en2, _ = fhand.get_files(reference_dir)
    th1, en1, th2, en2 = check_in_both(th1, en1, th2, en2)

    # Prefix each name with its folder, updating the lists in place.
    # NOTE: the separator is a literal backslash (Windows-style path).
    prefixed = (
        (corpus_dir, th1),
        (corpus_dir, en1),
        (reference_dir, th2),
        (reference_dir, en2),
    )
    for folder, names in prefixed:
        for idx, name in enumerate(names):
            names[idx] = folder + "\\" + name

    calc_with_text(en1, en2, th1, th2, out_file)
Ejemplo n.º 3
0
def test_get_files(filepaths):
    """
    Exercise ``fh.get_files`` before and after the data files exist.

    ``filepaths[0]`` is used as the salt file; ``filepaths[1:]`` are the
    data files, each paired with a fallback path that does not exist.
    """
    salt_f = filepaths[0]
    # A path that is never created, used as the first candidate of each pair.
    missing = Path(__file__).parent / 'test_not_exist.txt'
    paths = ((salt_f,),) + tuple((missing, filepaths[i]) for i in (1, 2, 3))

    # Phase 1: none of the data files exist yet.
    found = fh.get_files(paths)
    assert not found[1]
    assert found[0][1:] == (Path(missing),) * 3
    # assert files[2] == (0, 0, 0, 0)
    assert len(found[0][0]) == 16

    # Phase 2: create (or truncate) the data files and look again.
    salt = fh.generate_salt(salt_f)
    data_files = filepaths[1:]
    for target in data_files:
        with open(target, 'w'):
            pass
    found = fh.get_files(paths)
    # Now the files should be found.
    assert found[1]
    assert found[0][0] == salt
    assert found[0][1:] == tuple(Path(target).resolve() for target in data_files)
Ejemplo n.º 4
0
def test_cogs():
    """
    Run ``calc_with_text`` on the files listed in "corpus_files".

    The folder is listed with ``fhand.get_files`` and every returned name is
    prefixed with the folder before the call.
    """
    file_folder = "corpus_files"
    th1, en1, _ = fhand.get_files(file_folder)

    # The second folder ("correct_files": th2/en2 through check_in_both and
    # the matching prefix loops) is currently disabled, so only the corpus
    # lists are prepared here.

    # Prefix each name with the folder, updating the lists in place.
    # NOTE: the separator is a literal backslash (Windows-style path).
    for names in (th1, en1):
        for idx, name in enumerate(names):
            names[idx] = file_folder + "\\" + name

    # print(en1, en2, th1, th2)
    calc_with_text(en1, th1)
Ejemplo n.º 5
0
def refresh_files(session):
    """
    Re-load any JHU file that has been modified since it was first loaded.

    JHU frequently corrects a line or two in old files.  Each file reported
    by ``needs_refreshing`` has its lines refreshed and its modification
    time recorded, and is committed on its own.
    :param session: The SQLalchemy session
    :return: None
    :side_effect: The database MAY BE updated.
    """
    refreshed_count = 0
    for path in get_files(JHU_DATA_DIRECTORY):
        if needs_refreshing(session, path):
            refresh_lines(session, path)
            record_file_mtime(session, path)
            # One commit per file, covering the changes made for that file.
            session.commit()
            refreshed_count += 1

    if refreshed_count == 0:
        print("All files were up to date.")
Ejemplo n.º 6
0
def connect_to_pm_dbs(default_is_ok: bool = True, force_connect: bool = False) -> Optional[PM]:
    """
    Build a ``PM`` from the files located by ``get_files(fl.paths, ...)``.

    :param default_is_ok: Forwarded to ``get_files`` as its second argument
    :param force_connect: Build the ``PM`` even when ``get_files`` reports
        that the files do not exist
    :return: A ``PM`` instance, or None when the files do not exist and
        ``force_connect`` is false
    """
    filepaths, exists = get_files(fl.paths, default_is_ok)
    if not (exists or force_connect):
        return None

    # Element 0 is passed to PM as-is; elements 1-3 are wrapped first.
    first_entry = filepaths[0]
    auth_db = DB_auth(filepaths[1])
    keys_db = DB_keys(filepaths[2])
    password_db = DB_password(filepaths[3])
    return PM(first_entry, auth_db, keys_db, password_db)
Ejemplo n.º 7
0
def main():
    """
    List ``file_folder`` and process its single files and file pairs.

    Does nothing unless this module is being run as the main script.
    """
    # The guard sits inside the function: when the module is imported,
    # calling main() is a no-op.
    if __name__ != "__main__":
        return

    th, en, singles = fhand.get_files(file_folder)
    process_singles(singles)
    process_pairs(th, en)