Example #1
0
def process_pdb_file(cfg, gau_tpl_content, pdb_file):
    """
    Reads a PDB file and writes a Gaussian input (.com) file each time an END record is reached
    :param cfg: configuration mapping; the REMOVE_H and NUM entries are read here
    :param gau_tpl_content: dict with the Gaussian template lines stored under SEC_HEAD and SEC_TAIL
    :param pdb_file: str, location of the PDB file to read
    :return: None; each output file is written with list_to_file
    """
    with open(pdb_file) as d:
        mol_num = 0
        pdb_atom_line = []
        # iterate over the file object directly so the whole file is not loaded at once
        for line in d:
            pdb_section = line[:PDB_LINE_TYPE_LAST_CHAR]
            if pdb_section == 'MODEL ':
                mol_num += 1
            elif pdb_section in ('ATOM  ', 'HETATM'):
                element = line[PDB_BEFORE_ELE_LAST_CHAR:PDB_ELE_LAST_CHAR].strip()
                if element == '':
                    # nothing in the element columns, so fall back to the atom-type columns
                    element = line[PDB_ATOM_NUM_LAST_CHAR:PDB_ATOM_TYPE_LAST_CHAR].strip()
                pdb_xyz = line[PDB_MOL_NUM_LAST_CHAR:PDB_Z_LAST_CHAR]
                pdb_atom_line.append(["{:6}".format(element), pdb_xyz])
            elif pdb_section.rstrip() == 'END':
                # Removing trailing whitespace accepts an END record that is followed directly by a newline, that
                #    is padded with blanks, or that ends a file without a final newline; 'ENDMDL' still does not match
                if mol_num == 0:
                    mol_id = ''
                else:
                    # the count of MODEL records read so far distinguishes the output file names
                    mol_id = '_' + str(mol_num)
                d_out = create_out_fname(pdb_file, suffix=mol_id, ext='.com')
                if cfg[REMOVE_H]:
                    # drops the last atom collected (presumably an added hydrogen; verify against the caller)
                    del pdb_atom_line[-1]
                list_to_file(gau_tpl_content[SEC_HEAD] + pdb_atom_line + gau_tpl_content[SEC_TAIL], d_out)
                if cfg[NUM] and mol_num >= cfg[NUM]:
                    # the requested number of structures has been written
                    return
                pdb_atom_line = []
def check_and_print(cfg, atom_id, pdb_tpl_content, gausslog_file, pdb_data_section, f_name, mode, message):
    """
    Checks the number of atoms read against the pdb template (when one is used), then writes the pdb file
    :param cfg: configuration mapping; the check is made only when its PDB_TPL_FILE entry is truthy
    :param atom_id: int, number of atoms read from the gausslog file
    :param pdb_tpl_content: dict with the template's SEC_HEAD and SEC_TAIL lists and its NUM_ATOMS count
    :param gausslog_file: str, name of the file that was read (used in the error message)
    :param pdb_data_section: list of atom entries to write between the template head and tail
    :param f_name: str, name of the file to write
    :param mode: passed to list_to_file as its "mode" argument
    :param message: passed to list_to_file as its "print_message" argument
    :return: None
    :raises InvalidDataError: if a template is used and the atom counts differ
    """
    if cfg[PDB_TPL_FILE] and atom_id != pdb_tpl_content[NUM_ATOMS]:
        # the original message ended with a duplicated word ("atomsatoms"); it now ends with a single "atoms"
        raise InvalidDataError('In gausslog file: {}\nfound {} atoms, while the pdb template has {} '
                               'atoms'.format(gausslog_file, atom_id, pdb_tpl_content[NUM_ATOMS]))
    list_to_file(pdb_tpl_content[SEC_HEAD] + pdb_data_section + pdb_tpl_content[SEC_TAIL],
                 f_name, list_format=PDB_FORMAT, mode=mode, print_message=message)
Example #3
0
 def testMakeScanPlot(self):
     """Run main with a list of two scan logs and "--scan"; expect "Barriers" on stdout and a png file."""
     scan_logs = [
         os.path.join(SUB_DATA_DIR, log_name)
         for log_name in ("pet_dimer_scan_neg_tzvp.log", "pet_dimer_scan_pos_tzvp.log")
     ]
     list_fname = os.path.join(SUB_DATA_DIR, "scan_list.txt")
     list_to_file(scan_logs, list_fname)
     out_png_fname = os.path.join(SUB_DATA_DIR, "pet_dimer_scan.png")
     # remove any plot left by an earlier run so it cannot satisfy the file check below
     silent_remove(out_png_fname)
     try:
         with capture_stdout(main, ["-l", list_fname, "--scan", out_png_fname]) as output:
             self.assertTrue("Barriers" in output)
         self.assertTrue(os.path.isfile(out_png_fname))
     finally:
         # clean up both generated files unless removal is disabled
         for tmp_fname in (list_fname, out_png_fname):
             silent_remove(tmp_fname, disable=DISABLE_REMOVE)
def create_com_from_pdb_str(pdb_str, gau_tpl_content, com_fname):
    """
    Extracts one set of pdb coordinates from the "pdb_str" and writes them to "com_fname", placed between the
    head and tail sections of the Gaussian template
    :param pdb_str: str in pdb format
    :param gau_tpl_content: dict with contents of the Gaussian template file
    :param com_fname: str, name of file to be created
    :return: None
    """
    atom_records = ('ATOM', 'HETATM')
    stop_records = ('CONECT', 'END')
    coord_list = []
    for pdb_line in pdb_str.split("\n"):
        if pdb_line.startswith(atom_records):
            element = pdb_line[PDB_BEFORE_ELE_LAST_CHAR:PDB_ELE_LAST_CHAR].strip()
            pdb_xyz = pdb_line[PDB_MOL_NUM_LAST_CHAR:PDB_Z_LAST_CHAR]
            coord_list.append(["{:6}".format(element), pdb_xyz])
            continue
        if pdb_line.startswith(stop_records):
            # only the first set of coordinates is wanted
            break
    com_content = gau_tpl_content[SEC_HEAD] + coord_list + gau_tpl_content[SEC_TAIL]
    list_to_file(com_content, com_fname, print_message=False)
Example #5
0
    def testMismatchScanFiles(self):
        """Run main with "--scan" on a list of two different scan logs; expect "cannot" on stderr."""
        # NOTE: a further check, that a log without scan info ('me2propprpnt_7.log') reports
        #     "Did not find expected parameter scan info", is currently disabled
        mismatched_logs = [
            os.path.join(SUB_DATA_DIR, log_name)
            for log_name in ("tieg4pdc1scan.log", "pet_dimer_scan_pos_tzvp.log")
        ]
        list_fname = os.path.join(SUB_DATA_DIR, "scan_list.txt")
        list_to_file(mismatched_logs, list_fname)
        out_png_fname = os.path.join(SUB_DATA_DIR, "pet_dimer_scan.png")
        try:
            with capture_stderr(main, ["-l", list_fname, "--scan", out_png_fname]) as output:
                self.assertTrue("cannot" in output)
        finally:
            # clean up both file names unless removal is disabled
            for tmp_fname in (list_fname, out_png_fname):
                silent_remove(tmp_fname, disable=DISABLE_REMOVE)
def process_gausscom_file(cfg, gausscom_file, pdb_tpl_content):
    """
    Reads the atom types and coordinates from a Gaussian input file and writes them as a pdb file
    :param cfg: configuration mapping; the PDB_TPL_FILE and OUT_BASE_DIR entries are read here
    :param gausscom_file: str, location of the Gaussian input file to read
    :param pdb_tpl_content: dict with the template's SEC_HEAD, SEC_ATOMS, and SEC_TAIL lists and NUM_ATOMS count
    :return: None; the pdb file is written with list_to_file
    :raises InvalidDataError: if a template is used and the number of atoms differs from the template
    """
    with open(gausscom_file) as com_file:
        tpl_fname = cfg[PDB_TPL_FILE]
        # with a template, start from a copy of its atom entries; otherwise build the entries from scratch
        pdb_data_section = copy.deepcopy(pdb_tpl_content[SEC_ATOMS]) if tpl_fname else []
        section = SEC_HEAD
        atom_id = 0

        for raw_line in com_file:
            line = raw_line.strip()
            if section == SEC_HEAD:
                # nothing is kept from the header. Instructions (starting with %, possibly with blank lines) may
                #    come before the "route card" lines (which start with #)
                while not GAU_HEADER_PAT.match(line):
                    line = next(com_file).strip()
                # move past the first line of the route card
                line = next(com_file).strip()
                # the route card and then the description may each span several lines and end with a blank line
                for _ in range(2):
                    while line:
                        line = next(com_file).strip()
                    # step over the blank line to the line that follows it
                    line = next(com_file).strip()
                # "line" now holds the charge and multiplicity, which is not used
                section = SEC_ATOMS
                continue

            if section != SEC_ATOMS:
                continue

            if not line:
                # the tail comes only from the template, so nothing after the atoms is needed
                break
            fields = line.split()
            atom_type = fields[0]

            if tpl_fname:
                # working from a template: compare its atom type with the one just read
                try:
                    pdb_atom_type = pdb_data_section[atom_id][8].split(' ')[-1]
                except IndexError:
                    raise InvalidDataError(
                        'Gausscom file: {}\n   has more atoms than the expected {} atoms in '
                        'the template file: {}'.format(gausscom_file, pdb_tpl_content[NUM_ATOMS], tpl_fname))
                if atom_type != pdb_atom_type:
                    warning("Atom types do not match for atom number {}; pdb atom type is {} while gausscom type "
                            "is {}".format(atom_id, pdb_atom_type, atom_type))
            else:
                # no template: create a generic HETATM entry; the coordinates are filled in below
                pdb_data_section.append([
                    'HETATM', '{:5d}'.format(atom_id + 1), ' {:4} '.format(atom_type), 'UNL  ', 1, 0.0, 0.0, 0.0,
                    '  1.00  0.00          {:>2}'.format(atom_type)
                ])
            pdb_data_section[atom_id][5:8] = [float(coord) for coord in fields[1:4]]
            atom_id += 1

    # Reading is done; when a template is used, make sure the expected number of atoms was reached
    if tpl_fname and atom_id != pdb_tpl_content[NUM_ATOMS]:
        raise InvalidDataError(
            'In gausscom file: {}\n  found {} atoms, while the pdb template has {} atoms'
            .format(gausscom_file, atom_id, pdb_tpl_content[NUM_ATOMS]))
    f_name = create_out_fname(gausscom_file, ext='.pdb', base_dir=cfg[OUT_BASE_DIR])
    pdb_content = pdb_tpl_content[SEC_HEAD] + pdb_data_section + pdb_tpl_content[SEC_TAIL]
    list_to_file(pdb_content, f_name, list_format=PDB_FORMAT)
Example #7
0
def print_results(log_info,
                  list_of_conf_lists,
                  sort_by_enthalpy,
                  sort_by_energy,
                  max_diff=None,
                  print_winners=True,
                  out_fname=DEF_OUT_NAME):
    """
    Chooses the log file with the lowest convergence value from each list, sorts the chosen files, and
    reports them as comma-separated text
    :param log_info: dict keyed by log file name; each value is a dict read here for its CONVERG, ENERGY,
        ENTHALPY, GIBBS, and CONVERG_ERR entries
    :param list_of_conf_lists: list of lists of log file names; one "winner" is chosen from each inner list
    :param sort_by_enthalpy: if truthy, sort by enthalpy, unless any winner's enthalpy is NaN, in which case
        the energy is used instead
    :param sort_by_energy: if truthy (and not sorting by enthalpy), sort by energy; otherwise sort by the
        Gibbs free energy
    :param max_diff: if truthy, cutoff in kcal/mol; a difference from the lowest value is reported for each
        winner, and the winners within the cutoff are saved to "out_fname"
    :param print_winners: if truthy, print the text that is built
    :param out_fname: str, name of the file listing the winners that fall within the cutoff
    :return: the text built, and a str listing the winners with convergence errors or no convergence info
    """
    # positions of the values within each winner tuple: (file, converg, energy, enthalpy, gibbs)
    energy_col, enthalpy_col, gibbs_col = 2, 3, 4

    winners = []
    for conf_list in list_of_conf_lists:
        if len(conf_list) == 1:
            best_log = conf_list[0]
        else:
            best_log = None
            best_converg = 20000000.0
            for candidate in conf_list:
                candidate_converg = log_info[candidate][CONVERG]
                if candidate_converg < best_converg:
                    best_converg = candidate_converg
                    best_log = candidate
        best_info = log_info[best_log]
        winners.append((best_log, best_info[CONVERG], best_info[ENERGY], best_info[ENTHALPY],
                        best_info[GIBBS]))

    # enthalpy has priority over energy, but cannot be used for sorting if it is missing (NaN) for any winner
    if sort_by_enthalpy:
        sort_by_energy = any(isnan(entry[enthalpy_col]) for entry in winners)
        sort_by_enthalpy = not sort_by_energy

    if sort_by_enthalpy:
        sort_key, sort_type = enthalpy_col, "enthalpy"
    elif sort_by_energy:
        sort_key, sort_type = energy_col, "SCF energy"
    else:
        sort_key, sort_type = gibbs_col, "Gibbs free energy"
    winners.sort(key=lambda tup: tup[sort_key])

    winner_str = quote('","'.join(['File', CONVERG, ENERGY, ENTHALPY, GIBBS]))
    cutoff_list = []
    lowest_val = None
    within_cutoff = False
    if max_diff:
        winner_str += ',"Diff(kcal/mol)"\n'
        lowest_val = winners[0][sort_key]
        winner_str += f'"Files within {sort_type} cutoff of {max_diff:.2f} kcal/mol"\n'
        within_cutoff = True
    else:
        winner_str += '\n'

    warn_files_str = ''
    val_diff = 0.
    val_diff_str = ""
    for entry in winners:
        winner, converg, energy, enthalpy, gibbs = entry
        if max_diff:
            # difference of the value used for sorting from the lowest such value
            val_diff = (entry[sort_key] - lowest_val) * EHPART_TO_KCAL_MOL
            val_diff_str = f",{val_diff:.2f}"

        if within_cutoff:
            if val_diff > max_diff:
                # the winners are sorted, so all that follow are also outside the cutoff
                winner_str += '"Files outside of cutoff:"\n'
                within_cutoff = False
            else:
                cutoff_list.append(winner)

        winner_str += f'"{winner}",{converg:.4f},{energy:.6f},{enthalpy:.6f},{gibbs:.6f}{val_diff_str}\n'

        converg_err = log_info[winner][CONVERG_ERR]
        if converg_err:
            warn_files_str += '\n    {:}:  {:.2f}'.format(winner, converg)
        elif converg_err is None:
            warn_files_str += '\n    {:}:  Not found'.format(winner)

    if print_winners:
        print(winner_str)

    if cutoff_list:
        list_to_file(cutoff_list, out_fname)
    return winner_str, warn_files_str
def run_job(job, job_name_perhaps_with_dir, tpl_dict, cfg, testing_mode):
    """
    Creates a job script from the template named by cfg[JOB_RUN_TPL] and, unless in testing mode, runs it and
    checks the last line of the resulting log file
    :param job: str; '' for the first job, otherwise a key of cfg[TPL_DICT] that is also appended to the job name
    :param job_name_perhaps_with_dir: str, combined with cfg[GAUSS_IN_EXT] to name the input file of the first job
    :param tpl_dict: dict of values used to fill in the job script template; updated in place by this function
    :param cfg: configuration mapping
    :param testing_mode: if truthy, a fixed host name is used and the job script is created but not run
    :return: None
    :raises InvalidDataError: if the job is run and the last line of its log file does not match GAU_GOOD_PAT
    """
    # Determine if it will run fresh or from an old checkpoint
    if job == '':
        new_job_name = tpl_dict[JOB_NAME]
        tpl_dict[INPUT_FILE] = job_name_perhaps_with_dir + cfg[GAUSS_IN_EXT]
        if cfg[FIRST_JOB_CHK]:
            tpl_dict[OLD_CHECK_ECHO] = cfg[OLD_CHECK_ECHO].format(
                cfg[FIRST_JOB_CHK])
        else:
            tpl_dict[OLD_CHECK_ECHO] = ''
    else:
        new_job_name = tpl_dict[JOB_NAME] + '_' + job
        # the current job name is saved as the old job name before it is replaced below
        tpl_dict[OLD_JOB_NAME] = tpl_dict[JOB_NAME]
        tpl_dict[OLD_CHECK_ECHO] = cfg[OLD_CHECK_ECHO].format(
            tpl_dict[OLD_JOB_NAME])
        tpl_dict[INPUT_FILE] = cfg[TPL_DICT][job]

    tpl_file = cfg[JOB_RUN_TPL]
    job_runner_fname = create_out_fname(new_job_name,
                                        ext=".sh",
                                        base_dir=cfg[OUT_DIR])
    print("Running {}".format(new_job_name))

    tpl_dict[JOB_NAME] = new_job_name
    # copy these settings from the configuration, when present, so they can be used to fill in the template
    for key_name in [
            USER,
            MEM,
            PROC_LIST,
    ]:
        if key_name in cfg:
            tpl_dict[key_name] = cfg[key_name]

    tpl_str = read_tpl(tpl_file)
    # if either MEM or PROC_LIST is the default (Nonetype), and is used to run the job, get info from the node before
    #    creating the job script
    mem_required = '{' + MEM + '}' in tpl_str
    get_mem = mem_required and not tpl_dict[MEM]

    proc_required = '{' + PROC_LIST + '}' in tpl_str
    get_proc = proc_required and not tpl_dict[PROC_LIST]

    default_gauss_required = '{' + DEF_ROUTE + '}' in tpl_str

    num_procs = 1  # to make IDE happy
    proc_list = '0'  # to make IDE happy

    if get_mem or get_proc or default_gauss_required:
        # explicitly check each possible required info flag, because any or all can be requested
        if testing_mode:
            hostname = subprocess.check_output(["echo", "r1i7n35"
                                                ]).decode("utf-8").strip()
        else:
            #  Will not be covered in testing mode, as is not part of written code to be tested
            hostname = subprocess.check_output(["hostname"
                                                ]).decode("utf-8").strip()
        print(
            "Obtaining available memory and/or number of processors on node {}.\n    "
            "Note: this program assumes the whole node will be allocated to Gaussian.\n"
            .format(hostname))
        if get_mem:
            tpl_dict[MEM] = get_node_mem(testing_mode)

        max_cache = 1024 * 1024  # to make IDE happy; Gaussian default (conservative) is 1024 * 1024
        if get_proc or default_gauss_required:
            num_procs, proc_list, max_cache = get_proc_info(testing_mode)
        if get_proc:
            tpl_dict[PROC_LIST] = proc_list
            print(
                "    Found {} processors. Will allow use of cpus {}.\n".format(
                    num_procs, proc_list))

        if get_mem or get_proc:
            print(
                "    The user may override these values by specifying the '{}' and/or '{}' keywords in the "
                "configuration file.\n    Be sure to use the formatting Gaussian expects.\n"
                .format(MEM, PROC_LIST))

        if default_gauss_required:
            # write a 'Default.Route' file in the scratch directory with the cache size and maximum disk values
            max_disk = get_max_disk(testing_mode)
            max_cache = int(max_cache)
            print(
                "Since '{}' found in the {}, read machine specs to determine CacheSize={} and "
                "MaxDisk={}".format(DEF_ROUTE, JOB_RUN_TPL, max_cache,
                                    max_disk))
            default_route_list = [
                "-#- CacheSize={}".format(max_cache),
                "-#- MaxDisk={}".format(max_disk)
            ]
            fname = create_out_fname('Default.Route',
                                     base_dir=cfg[SCRATCH_DIR])
            list_to_file(default_route_list, fname)
            tpl_dict[
                DEF_ROUTE] = ''  # there is an action triggered, not a value needed, so replaced with blank space

    # Fill in and save the job script. Each time a KeyError is raised, look for the missing key in the
    #    configuration and try again; re-raise the error if the key is not there either
    move_on = False
    while not move_on:
        try:
            fill_save_tpl(tpl_str, tpl_dict, tpl_file, job_runner_fname)
            move_on = True
        except KeyError as e:
            # takes the text between the first pair of single quotes in the error message as the key name
            #    (assumes the message quotes the missing key; verify against fill_save_tpl)
            missing_key = e.args[0].split("\'")[1]
            if missing_key in cfg:
                tpl_dict[missing_key] = cfg[missing_key]
            else:
                raise e
    # make the job script executable
    subprocess.call(["chmod", "+x", job_runner_fname])
    if testing_mode:
        print(
            "Testing mode; did not run job script or check Gaussian output for normal termination.\n"
        )
    else:
        # do not want this tested, as actually running Gaussian would take too long, and not what should be tested
        p1 = subprocess.Popen(job_runner_fname)
        p1.wait()
        out_file = tpl_dict[JOB_NAME] + ".log"
        # the last line of the log file is used to decide whether the job succeeded
        last_line = subprocess.check_output(["tail", "-1",
                                             out_file]).strip().decode("utf-8")
        if GAU_GOOD_PAT.match(last_line):
            print("Successfully completed {}\n".format(out_file))
            # the job script is removed only after a successful run
            os.remove(job_runner_fname)
        else:
            raise InvalidDataError('Job failed: {}'.format(out_file))
Example #9
0
def get_thermochem(file_set, results_dict, save_vibes, out_dir,
                   tog_output_fname, qh_h_opt, write_mode):
    """
    Collects the thermochemistry values at each temperature from stored GoodVibes output
    :param file_set: list of reactant file(s), TS file (or separator), and optionally products
    :param results_dict: dictionary of results from running hartree and goodvibes, keyed by file base name
    :param save_vibes: boolean to determine whether to save each GoodVibes output separately
    :param out_dir: directory to save GoodVibes output files (if requested)
    :param tog_output_fname: None or string (file name) if saving each GoodVibes output together
    :param qh_h_opt: boolean; if True, the quasi-harmonic enthalpy column is also read
    :param write_mode: str, file mode used when writing to "tog_output_fname"; if it is 'w', it is switched
        to 'a' after the first file is written so output from later files is added to the same file
    :return: array of temperatures, then lists (one entry per item in file_set) for h, qh_h, gt, and qh_gt;
        the entries are multiplied by EHPART_TO_KCAL_MOL (the qh_h entries only if qh_h_opt)
    """
    h, qh_h, gt, qh_gt = [], [], [], []
    temps = []
    for index, fname in enumerate(file_set):
        base_name = os.path.basename(fname)
        if fname == REACT_PROD_SEP:
            # a separator has no data; hold its place with NaN for each temperature read so far
            for series in (h, qh_h, gt, qh_gt):
                series.append(np.full([len(temps)], np.nan))
            continue

        vibes_out = results_dict[base_name][GOODVIBES_OUT]
        file_h, file_qh_h, file_gt, file_qh_gt = [], [], [], []
        h.append(file_h)
        qh_h.append(file_qh_h)
        gt.append(file_gt)
        qh_gt.append(file_qh_gt)

        found_structure = False
        skip_line = True
        # the first 10 lines and the last 2 lines are not read
        for line in vibes_out[10:-2]:
            if GOODVIBES_ERROR_PAT.match(line):
                raise InvalidDataError("See GoodVibes output: {}".format(vibes_out))
            if not found_structure:
                # still looking for the line that marks the start of the data
                if GOODVIBES_DATA_PAT.match(line):
                    found_structure = True
                continue
            if skip_line:
                # one line after the data marker is also passed over
                skip_line = False
                continue
            vals = line.split()
            if index == 0:
                # temperatures are collected from the first file only
                temps.append(float(vals[1]))
            file_h.append(float(vals[2]))
            if qh_h_opt:
                file_qh_h.append(float(vals[3]))
            file_gt.append(float(vals[-2]))
            file_qh_gt.append(float(vals[-1]))

        if save_vibes:
            vibes_out_fname = os.path.relpath(
                create_out_fname(fname, suffix='_vibes', base_dir=out_dir, ext='.dat'))
            list_to_file(vibes_out, vibes_out_fname, print_message=False)
            print('Saved GoodVibes output as: {}'.format(vibes_out_fname))
        if tog_output_fname:
            list_to_file(vibes_out, tog_output_fname, mode=write_mode, print_message=False)
            if write_mode == 'w':
                print("Adding all GoodVibes output to: {}".format(tog_output_fname))
                write_mode = "a"

    def to_kcal(series):
        # convert the values collected for each molecule to an array in kcal/mol
        return [np.asarray(vals_for_mol) * EHPART_TO_KCAL_MOL for vals_for_mol in series]

    temps = np.asarray(temps)
    h = to_kcal(h)
    if qh_h_opt:
        qh_h = to_kcal(qh_h)
    gt = to_kcal(gt)
    qh_gt = to_kcal(qh_gt)

    return temps, h, qh_h, gt, qh_gt
def process_gausslog_file(gausslog_file, com_tpl_content, charge_from_log_flag,
                          find_low_energy, step_num, base_dir, out_fname):
    """
    Reads coordinates from a Gaussian log file and writes them, with the template head and tail, to a new file
    :param gausslog_file: str, location of the Gaussian log file to read
    :param com_tpl_content: dict with the template's SEC_HEAD, SEC_ATOMS, and SEC_TAIL lists, its NUM_ATOMS
        count, and its BASE_NAME; the SEC_HEAD list is changed in place (comment line and, possibly, the
        charge and multiplicity line)
    :param charge_from_log_flag: when this evaluates to False, lines matching GAU_CHARGE_PAT are read and the
        charge and multiplicity found replace the last line of the template head
    :param find_low_energy: if truthy, keep the coordinates with the lowest energy found (only energies
        below 0.0 are considered) instead of the last coordinates
    :param step_num: if truthy (and find_low_energy is not), stop reading when this step number is reached, so
        the coordinates kept are the last ones completed before that point
    :param base_dir: passed to create_out_fname as the base directory for the output file
    :param out_fname: if truthy, used to name the output file; otherwise the name is based on the log file name
        and the template's BASE_NAME
    :return: None; the output is written with list_to_file
    :raises InvalidDataError: if no coordinates are found, an atomic number is not in ATOM_NUM_DICT, or the
        atoms read do not match the atoms in the template
    """
    with open(gausslog_file) as d:
        rel_path_fname = os.path.relpath(gausslog_file)
        # The header may be more than 5 lines long--counting from end makes sure the comment goes in the correct line
        if find_low_energy:
            com_tpl_content[SEC_HEAD][-3] = "Low energy conformation from file {}".format(rel_path_fname)
        elif step_num:
            step_num = int(step_num)
            com_tpl_content[SEC_HEAD][-3] = "Conformation from step number {} in file {}".format(
                step_num, rel_path_fname)
        else:
            com_tpl_content[SEC_HEAD][-3] = "Last conformation from file {}".format(rel_path_fname)
        lowest_energy_found = 0.0
        current_step_num = None
        final_atoms_section = []
        atom_type_list = []
        section = SEC_HEAD
        atom_id = 0
        # use a local flag, so the flag that was passed in is not changed and still applies to any other log file
        find_charge = not charge_from_log_flag

        for line in d:
            line = line.strip()
            if len(line) == 0:
                continue
            # not currently keeping anything from the header
            if section == SEC_HEAD:
                if find_charge:
                    if GAU_CHARGE_PAT.match(line):
                        charge_mult = []
                        # there may be several consecutive charge lines; gather all of them
                        while find_charge:
                            split_line = line.split('=')
                            charge_mult.append('{}  {}'.format(
                                int(split_line[1].split()[0]),
                                int(split_line[2].split()[0])))
                            line = next(d).strip()
                            if not GAU_CHARGE_PAT.match(line):
                                if len(charge_mult) > 1:
                                    section = SEC_INITIAL_COORDINATES
                                    final_atoms_section = []
                                    # already reading the next section, so grab the needed info
                                    atom_type_list.append(line.split()[0])
                                com_tpl_content[SEC_HEAD][-1] = '   '.join(charge_mult)
                                find_charge = False
                        continue
                if step_num and GAU_STEP_PAT.match(line):
                    split_line = line.split()
                    current_step_num = int(split_line[2])
                    if current_step_num == step_num:
                        break

                if GAU_COORD_PAT.match(line):
                    atoms_section = []
                    # the two lines after the coordinate heading are not needed
                    next(d)
                    next(d)
                    section = SEC_ATOMS
                    continue

            elif section == SEC_INITIAL_COORDINATES:
                while len(line) > 0:
                    # originally just added whole line to final. Then found that this section prints fewer sig figs
                    #   than the coordinate section, so taking those instead
                    atom_type_list.append(line.split()[0])
                    line = next(d).strip()
                while not GAU_COORD_PAT.match(line):
                    line = next(d).strip()
                next(d)
                next(d)
                line = next(d).strip()
                while not GAU_SEP_PAT.match(line):
                    split_line = line.split()
                    atom_xyz = ["{:>12}".format(x) for x in split_line[3:6]]
                    final_atoms_section.append(
                        '{:16}'.format(atom_type_list[atom_id]) + ' '.join(atom_xyz))
                    atom_id += 1
                    line = next(d).strip()
                # only this first set of coordinates is used in this case
                break
            elif section == SEC_ATOMS:
                if GAU_SEP_PAT.match(line):
                    section = SEC_TAIL
                    continue

                split_line = line.split()
                try:
                    atom_type = ATOM_NUM_DICT[int(split_line[1])]
                except KeyError:
                    raise InvalidDataError(
                        "Currently, this code only expects atom numbers up to 36 (Kr), and the "
                        "atomic number read was {}. Update the code to use this with your current "
                        "output.".format(split_line[1]))
                if com_tpl_content[NUM_ATOMS]:
                    com_atom_type = re.split('[ (]', com_tpl_content[SEC_ATOMS][atom_id])[0].strip()
                    if com_atom_type != atom_type:
                        # the template may give an atomic number instead of an element symbol
                        try:
                            if ATOM_NUM_DICT[int(com_atom_type)] != atom_type:
                                raise ValueError
                        except ValueError:
                            raise InvalidDataError(
                                "For atom number {}, {} has atom type '{}', while the template has "
                                "atom type '{}'".format(atom_id + 1, gausslog_file, atom_type, com_atom_type))
                    # This keeps the "fragment" number if there
                    atom_type = com_tpl_content[SEC_ATOMS][atom_id]
                atom_type = '{:16}'.format(atom_type)

                atom_xyz = ["{:>12}".format(x) for x in split_line[3:6]]
                atoms_section.append(atom_type + ''.join(atom_xyz))
                atom_id += 1
            elif section == SEC_TAIL:
                if com_tpl_content[NUM_ATOMS] and atom_id != com_tpl_content[NUM_ATOMS]:
                    raise InvalidDataError(
                        'In gausslog file: {}\n  found {} atoms, but the tpl expects '
                        '{} atoms'.format(gausslog_file, atom_id, com_tpl_content[NUM_ATOMS]))
                if GAU_E_PAT.match(line):
                    if find_low_energy:
                        split_line = line.split()
                        energy = float(split_line[4])
                        if energy < lowest_energy_found:
                            final_atoms_section = atoms_section[:]
                            # remember this energy; without this update, every later structure with a negative
                            #    energy would replace the stored one, even when its energy is higher
                            lowest_energy_found = energy
                    else:
                        final_atoms_section = atoms_section[:]
                    # go back to looking for the next set of coordinates
                    section = SEC_HEAD
                    atom_id = 0

    if len(final_atoms_section) == 0:
        raise InvalidDataError(
            "Check that the following log file has coordinates to use and/or specified step "
            "number: {}".format(gausslog_file))
    if out_fname:
        f_name = create_out_fname(out_fname, base_dir=base_dir)
    else:
        f_name = create_out_fname(gausslog_file,
                                  suffix='_' + com_tpl_content[BASE_NAME],
                                  ext='.com',
                                  base_dir=base_dir)
    list_to_file(com_tpl_content[SEC_HEAD] + final_atoms_section + com_tpl_content[SEC_TAIL], f_name)
def process_gausscom_file(gausscom_file, tpl_com_content, read_new_charge, out_dir):
    """
    Reads the atoms and coordinates from a Gaussian input file and writes them, with the template head and tail,
    to a new Gaussian input file
    :param gausscom_file: str, location of the Gaussian input file to read
    :param tpl_com_content: dict with the template's SEC_HEAD, SEC_ATOMS, and SEC_TAIL lists and its ATOM_TYPES
        list; the last line of SEC_HEAD is replaced in place if read_new_charge is truthy
    :param read_new_charge: if truthy, the charge and multiplicity line read from "gausscom_file" is checked
        and then used in place of the last line of the template head
    :param out_dir: passed to create_out_fname as the base directory for the output file
    :return: None; the output is written with list_to_file
    :raises InvalidDataError: if the charge and multiplicity line is invalid, the atoms do not match the
        atoms in the template, or the file cannot be decoded
    """
    # to make the later part easier to read
    tpl_atoms = tpl_com_content[SEC_ATOMS]
    tpl_atom_types = tpl_com_content[ATOM_TYPES]
    tpl_atom_num = len(tpl_atom_types)
    with open(gausscom_file) as d:
        section = SEC_HEAD
        atom_id = 0
        atom_content = []

        try:
            for line in d:
                line = line.strip()
                # not currently keeping anything from the header; just check num atoms
                if section == SEC_HEAD:
                    # there may be some instructions (which start with %, and can have some blank lines) before the
                    #    "route card lines" (which start with #)
                    while not GAU_HEADER_PAT.match(line):
                        line = next(d).strip()
                    # skip first line of route card
                    line = next(d).strip()
                    # for "route card" and then description, there may be more than one header line; look for blank line
                    for _ in range(2):
                        while len(line) > 0:
                            line = next(d).strip()
                        # now move past the blank line, and get the content of the following line
                        line = next(d).strip()
                    # now on charge, multiplicity line, which we also skip unless we use its charge/mult
                    if read_new_charge:
                        # make sure reading a valid charge/mult line, with at least 2 integers
                        try:
                            charge_mult = line.split()
                            int(charge_mult[0])
                            int(charge_mult[1])
                            if len(charge_mult) % 2 != 0:
                                raise IndexError
                        except (IndexError, ValueError):
                            raise InvalidDataError("Problem while reading file: {}\nOption to read charge and "
                                                   "multiplicity from template not chosen, but found invalid data on "
                                                   "the expected line: {}".format(os.path.basename(gausscom_file),
                                                                                  line))
                        tpl_com_content[SEC_HEAD][-1] = line
                    section = SEC_ATOMS
                    continue

                elif section == SEC_ATOMS:
                    # stay in atom section until a blank line is reached
                    while len(line) > 0:
                        split_line = line.split()
                        # if there is a freeze/no freeze col, will be 5 columns (split by ' '); Keep atom info together
                        if len(split_line) == 5:
                            atom_info = "{:2}{:>8}".format(split_line[0], split_line[1])
                        else:
                            atom_info = split_line[0]

                        # if template has atoms, check atom type
                        if tpl_atom_num > 0:
                            if atom_id >= tpl_atom_num:
                                # report this clearly, instead of letting the lookups below raise an IndexError
                                raise InvalidDataError("Problem while reading file: {}\nFound more atoms than "
                                                       "the {} atoms in the tpl "
                                                       "file".format(os.path.basename(gausscom_file), tpl_atom_num))
                            atom_type = atom_info.split()[0].split('(')[0]
                            if atom_type != tpl_atom_types[atom_id]:
                                # the type read from the file is reported first, then the type from the tpl
                                #    (these two values were previously given in the opposite order)
                                raise InvalidDataError("Problem while reading file: {}\nAtom types do not match for "
                                                       "atom number {}: file has type {} while tpl has type "
                                                       "{}".format(os.path.basename(gausscom_file), atom_id + 1,
                                                                   atom_type, tpl_atom_types[atom_id]))
                            # use the atom entry from the template, so any extra information in it is kept
                            atom_info = tpl_atoms[atom_id]

                        atom_xyz = ["{:>12}".format(x) for x in split_line[-3:]]
                        atom_content.append('{:18}'.format(atom_info) + '  '.join(atom_xyz))
                        atom_id += 1
                        line = next(d).strip()
                    # Don't need to read the tail, because we won't use it
                    break
        except StopIteration:
            # reached the end of the file while reading ahead; the atom number check below still applies
            pass
        except UnicodeDecodeError:
            raise InvalidDataError(f"Error in reading file: {gausscom_file}\n           Exiting program.")

        # now loop is done; check atom number if atoms are in the tpl file
        check_num_atoms(atom_id, gausscom_file, tpl_atom_num)

        f_name = create_out_fname(gausscom_file, ext='.com', base_dir=out_dir)
        list_to_file(tpl_com_content[SEC_HEAD] + atom_content + tpl_com_content[SEC_TAIL], f_name)