def parse_band_gaps(lines, initial_lineno):
    """Extract the band gap values reported in the output.

    Note: this is new for CRYSTAL17

    Every line from ``initial_lineno`` to the end of ``lines`` is scanned.
    A line containing ``BAND GAP`` has to match one of ``ALPHA BAND GAP:*eV``,
    ``BETA BAND GAP:*eV`` or ``BAND GAP:*eV``; the first value that
    ``split_numbers`` returns for it is stored under the key ``"alpha"``,
    ``"beta"`` or ``"all"`` respectively.

    Parameters
    ----------
    lines: list[str]
    initial_lineno: int
        index of the first line to scan

    Returns
    -------
    ParsedSection
        built from ``initial_lineno``, the gaps collected so far and, when a
        band gap line has an unknown format or a gap type is reported twice,
        an error message as third argument

    """
    # TODO breaking line?
    # TODO use regex:
    # re.compile(r"(DIRECT|INDIRECT) ENERGY BAND GAP:\s*([.\d]*)",
    #            re.DOTALL),
    recognised = (
        ("ALPHA BAND GAP:*eV", "alpha"),
        ("BETA BAND GAP:*eV", "beta"),
        ("BAND GAP:*eV", "all"),
    )
    gaps = {}

    for lineno, raw_line in enumerate(lines[initial_lineno:], initial_lineno):
        text = raw_line.strip()
        if "BAND GAP" not in text:
            continue

        # first recognised layout wins (same order as an if/elif chain)
        gap_type = next(
            (name for glob, name in recognised if fnmatch(text, glob)), None)
        if gap_type is None:
            message = "found a band gap of unknown format at line {0}: {1}".format(
                lineno, text)
            return ParsedSection(initial_lineno, gaps, message)

        gap_value = split_numbers(text)[0]
        if gap_type in gaps:
            message = "band gap data already contains {0} value before line {1}: {2}".format(
                gap_type, lineno, text)
            return ParsedSection(initial_lineno, gaps, message)
        gaps[gap_type] = gap_value

    return ParsedSection(initial_lineno, gaps)
def parse_symmetry_section(data, initial_lineno, line, lines):
    """Add the primitive symmetry operations to ``data``, if ``line`` starts them.

    Nothing happens unless ``line`` matches
    ``*SYMMOPS - TRANSLATORS IN FRACTIONAL UNITS*``. Otherwise the two
    following lines are checked to be the expected headers, and one row per
    symmetry operation is read after them; elements 2 to 13 of the 14 values
    on each row are stored in ``data["primitive_symmops"]``.

    Parameters
    ----------
    data: dict
        existing data to add the symmetry data to (modified in place)
    initial_lineno: int
        index of ``line`` within ``lines``
    line: str
        the current line
    lines: list[str]

    Raises
    ------
    IOError
        if the operation count, either header line or a data row does not
        have the expected layout

    """
    if not fnmatch(line, "*SYMMOPS - TRANSLATORS IN FRACTIONAL UNITS*"):
        return

    counts = split_numbers(line)
    if len(counts) != 1:
        raise IOError(
            "was expecting a single number, representing the number of symmops, on this line:"
            " {0}, got: {1}".format(initial_lineno, line))
    nsymmops = int(counts[0])

    frame_lineno = initial_lineno + 1
    if not fnmatch(
            lines[frame_lineno],
            "*MATRICES AND TRANSLATORS IN THE CRYSTALLOGRAPHIC REFERENCE FRAME*"):
        raise IOError(
            "was expecting CRYSTALLOGRAPHIC REFERENCE FRAME on this line"
            " {0}, got: {1}".format(frame_lineno, lines[frame_lineno].strip()))

    header_lineno = initial_lineno + 2
    if not fnmatch(lines[header_lineno],
                   "*V*INV*ROTATION MATRICES*TRANSLATORS*"):
        raise IOError(
            "was expecting symmetry headers on this line"
            " {0}, got: {1}".format(header_lineno, lines[header_lineno].strip()))

    first_row = initial_lineno + 3
    operations = []
    for row_lineno in range(first_row, first_row + nsymmops):
        row = split_numbers(lines[row_lineno])
        if len(row) != 14:
            raise IOError(
                "was expecting 14 values for symmetry data on this line"
                " {0}, got: {1}".format(row_lineno, lines[row_lineno].strip()))
        # the first two values of each row are not stored
        operations.append(row[2:14])
    data["primitive_symmops"] = operations
def parse_crystal_ppan(content):
    """Parse CRYSTAL Mulliken Population outputs (PPAN.DAT)

    Parameters
    ----------
    content: str

    Returns
    -------
    dict
        one entry per spin block (``"alpha+beta_electrons"`` and, for a
        second block, ``"alpha-beta_electrons"``), each holding an ``"atoms"``
        list and the ``"summed_charge"`` over those atoms

    Notes
    -----

    Format:

    ::

        NSPIN,NATOM
        IAT,NSHELL
        Xiat,Yiat,Ziat (AU)
        QTOT shell charges
        NORB
        orbital charges
    """
    spin_labels = ["alpha+beta_electrons", "alpha-beta_electrons"]
    rows = content.splitlines()

    def next_numbers():
        # numbers on the next line handed out by ``_new_line``
        return split_numbers(_new_line(rows))

    def read_at_least(count, values):
        # charges may wrap over several lines: keep reading until enough
        while len(values) < count:
            values.extend(next_numbers())
        return values

    result = {}
    nspin, natoms = next_numbers()
    for spin_index in range(int(nspin)):
        section = result.setdefault(spin_labels[spin_index], {"atoms": []})
        for _ in range(int(natoms)):
            atomic_number, nshell = next_numbers()
            coordinate = next_numbers()
            charges = next_numbers()
            total_charge = charges[0]
            shell_charges = read_at_least(nshell, charges[1:])
            (norbitals, ) = next_numbers()
            orbital_charges = read_at_least(norbitals, [])
            section["atoms"].append({
                "atomic_number": atomic_number,
                "coordinate": coordinate,
                "total_charge": total_charge,
                "shell_charges": shell_charges,
                "orbital_charges": orbital_charges,
            })
        section["summed_charge"] = sum(
            [atom["total_charge"] for atom in section["atoms"]])
    return result
def initial_parse(lines):
    """Scan the file for errors, and find the final elapsed time value.

    Each line is classified by the first of these tests that succeeds:

    - contains ``WARNING`` (any case): recorded as a warning
    - contains ``ERROR``: recorded as an error, except for the
      ``open_hca: getaddr_netdev ERROR`` message
    - contains ``MPI_Abort``: recorded as an error, first occurrence only
    - contains ``CONVERGENCE TESTS UNSATISFIED`` (any case): recorded as an
      error
    - contains ``TELAPSE``: remembered as the (latest) elapsed time line
    - starts with ``ENDPROP``: marks the output as complete

    Parameters
    ----------
    lines: list[str]

    Returns
    -------
    tuple
        ``(errors, warnings, parser_errors, total_seconds, start_lines)``;
        ``total_seconds`` is ``None`` when no ``TELAPSE`` line was found and
        ``start_lines`` is always an empty dict in this implementation

    """
    errors = []
    warnings = []
    parser_errors = []
    mpi_abort = False
    telapse_line = None
    start_lines = {}
    found_endprop = False

    for lineno, line in enumerate(lines):
        stripped = line.strip()
        upper = line.upper()

        if "WARNING" in upper:
            warnings.append(stripped)
        elif "ERROR" in line:
            # TODO ignore errors before program execution (e.g. in mpiexec setup)?
            if "open_hca: getaddr_netdev ERROR" not in line:
                errors.append(stripped)
        elif "MPI_Abort" in line:
            # only record one mpi_abort event (to not clutter output)
            if not mpi_abort:
                errors.append(stripped)
                mpi_abort = True
        elif "CONVERGENCE TESTS UNSATISFIED" in upper:
            errors.append(stripped)
        elif "TELAPSE" in line:
            telapse_line = lineno
        elif stripped.startswith("ENDPROP"):
            found_endprop = True

    total_seconds = None
    # compare against None: line index 0 is a valid (falsy) position
    if telapse_line is not None:
        after_label = lines[telapse_line].split("TELAPSE")[1]
        total_seconds = int(split_numbers(after_label)[0])

    if not found_endprop:
        # TODO separate exit code?
        parser_errors.append("No ENDPROP found in stdout")

    return errors, warnings, parser_errors, total_seconds, start_lines
def parse_scf_final_energy(lines, initial_lineno, final_lineno=None):
    """read post initial scf data

    Scans from ``initial_lineno`` until ``final_lineno`` (when given) or
    until a line starting with ``TTTTTTT`` or ``******``. On the way, a line
    matching ``TOTAL ENERGY*DE*`` provides the corrected total energy, which
    is converted from hartree to eV and stored under ``"total_corrected"``.

    Parameters
    ----------
    lines: list[str]
    initial_lineno: int
        index of the first line to scan
    final_lineno: int or None
        index at which to stop scanning; ``None`` means no fixed limit

    Returns
    -------
    ParsedSection
        the line number where parsing stopped and the energies found. The
        line number is ``final_lineno`` when that was given, otherwise the
        index of the last line examined (so it is never ``None``). If no end
        of section was found, an error message is passed as third argument.

    Raises
    ------
    IOError
        if the total energy is not given in a.u., or is found twice

    """
    scf_energy = {}
    # stays at the start index when there is nothing to scan
    curr_lineno = initial_lineno
    for curr_lineno, line in enumerate(lines[initial_lineno:],
                                       initial_lineno):
        if final_lineno is not None and curr_lineno == final_lineno:
            return ParsedSection(final_lineno, scf_energy)

        text = line.strip()
        if text.startswith(("TTTTTTT", "******")):
            # without an explicit limit, report the terminator line itself
            # rather than handing ``None`` back as a line number
            stop_lineno = curr_lineno if final_lineno is None else final_lineno
            return ParsedSection(stop_lineno, scf_energy)

        if fnmatch(text, "TOTAL ENERGY*DE*"):
            if not fnmatch(text, "TOTAL ENERGY*AU*DE*"):
                raise IOError("was expecting units in a.u. on line:"
                              " {0}, got: {1}".format(curr_lineno, line))
            if "total_corrected" in scf_energy:
                raise IOError("total corrected energy found twice, on line:"
                              " {0}, got: {1}".format(curr_lineno, line))
            scf_energy["total_corrected"] = convert_units(
                split_numbers(line)[1], "hartree", "eV")

    return ParsedSection(
        curr_lineno if final_lineno is None else final_lineno,
        scf_energy,
        "Did not find end of Post SCF section (starting on line {})".format(
            initial_lineno),
    )
def read_gaussian_cube(handle, return_density=False, dist_units="angstrom"):
    """Parse gaussian cube files to a data structure.

    The specification can be found at:
    http://h5cube-spec.readthedocs.io/en/latest/cubeformat.html

    CRYSTAL outputs include DENSCUBE.DAT, SPINCUBE.DAT, POTCUBE.DAT.

    Parameters
    ----------
    handle : file-like
        an open file handle
    return_density : bool
        whether to read and return the density values
    dist_units : str
        the distance units to return

    Returns
    -------
    aiida_crystal17.parsers.raw.gaussian_cube.GcubeResult

    Raises
    ------
    NotImplementedError
        if the settings record has a fifth value other than 1 (NVAL != 1),
        or a negative atom count (DSET_IDS)

    """
    file_units = "bohr"

    # two free-text header lines, then the settings record
    title = handle.readline().strip()
    comment = handle.readline().strip()
    header = [title, comment]
    settings = split_numbers(handle.readline().strip())

    if len(settings) > 4 and settings[4] != 1:
        # TODO implement NVAL != 1
        raise NotImplementedError("not yet implemented NVAL != 1")

    natoms = settings[0]
    origin = convert_units(np.array(settings[1:4]), file_units, dist_units)
    if natoms < 0:
        # TODO implement DSET_IDS
        raise NotImplementedError("not yet implemented DSET_IDS")

    # three axis records: number of points followed by the voxel step vector
    npoints = []
    steps = []
    for _ in range(3):
        count, sx, sy, sz = split_numbers(handle.readline().strip())
        npoints.append(count)
        steps.append([sx, sy, sz])

    voxel_cell = convert_units(np.array(steps), file_units, dist_units)
    cell_vectors = [
        convert_units(np.array(step) * count, file_units, dist_units)
        for count, step in zip(npoints, steps)
    ]

    atomic_numbers = []
    nuclear_charges = []
    ccoords = []
    for _ in range(int(natoms)):
        anum, ncharge, x, y, z = split_numbers(handle.readline().strip())
        atomic_numbers.append(int(anum))
        nuclear_charges.append(ncharge)
        # atom positions are stored relative to the grid origin
        position = convert_units(np.asarray([x, y, z]), file_units, dist_units)
        ccoords.append((position - origin).tolist())

    density = None
    if return_density:
        # everything left in the file is the flattened grid of values
        tokens = [token for row in handle for token in row.split()]
        density = np.array(tokens, dtype=float).reshape(
            tuple(int(count) for count in npoints))

    units = {"conversion": "CODATA2014", "length": dist_units}
    return GcubeResult(
        header,
        [vector.tolist() for vector in cell_vectors],
        voxel_cell.tolist(),
        [int(count) for count in npoints],
        origin.tolist(),
        ccoords,
        nuclear_charges,
        atomic_numbers,
        units,
        density,
    )
# Example #7
# 0
def parse_crystal_fort25(content):
    """Parse the fort.25 output from CRYSTAL.

    Only DOSS records are supported. The first occurrence of a projection id
    is treated as the alpha data and the second as the beta data; within each
    set, the projection with the highest id is returned as the total.

    Parameters
    ----------
    content: str
        the content of the fort.25 file

    Returns
    -------
    dict
        with keys ``units``, ``energy``, ``system_type``, ``fermi_energy``,
        ``total_alpha``, ``total_beta``, ``projections_alpha`` and
        ``projections_beta``; energies are converted from hartree to eV

    Raises
    ------
    IOError
        if no projection is found, a projection is truncated or not of type
        DOSS, projections are inconsistent with each other, or a projection
        id occurs more than twice

    Notes
    -----
    File Format:

    ::

        1ST RECORD : -%-,IHFERM,TYPE,NROW,NCOL,DX,DY,COSXY (format : A3,I1,A4,2I5,1P,(3E12.5))
        2ND RECORD : X0,Y0 (format : 1P,6E12.5)
        3RD RECORD : I1,I2,I3,I4,I5,I6 (format : 6I3)
        4TH RECORD
        AND FOLLOWING : ((RDAT(I,J),I=1,NROW),J=1,NCOL) (format : 1P,6E12.5)

        Meaning of the variables:
        1   NROW            1 (DOSS are written one projection at a time)
            NCOL            number of energy points in which the DOS is calculated
            DX              energy increment (hartree)
            DY              not used
            COSXY           Fermi energy (hartree)
        2   X0              energy corresponding to the first point
            Y0              not used
        3   I1              number of the projection;
            I2              number of atomic orbitals of the projection;
            I3,I4,I5,I6     not used
        4   RO(J),J=1,NCOL  DOS: density of states ro(eps(j)) (atomic units).

    NOTE(review): the code reads the energy increment from the fourth number
    of the first record and the initial energy from the second number of the
    second record, which does not line up with the table above - confirm
    against real files before changing either.

    """
    system_type = None
    fermi_energy = None
    energy_delta = None
    initial_energy = None
    len_dos = None
    alpha_projections = {}
    beta_projections = {}
    proj_number = 0

    lines = content.splitlines()
    nlines = len(lines)
    lineno = 0

    while lineno < nlines:
        line = lines[lineno].strip()

        if not line.startswith("-%-"):
            lineno += 1
            continue

        proj_number += 1

        # slicing (not indexing) so a bare "-%-" line reaches the checks below
        system_type = _fort25_consistent(
            "system type", system_type, line[3:4], proj_number)

        if not line[4:8] == "DOSS":
            raise IOError("projection {0} is not of type DOSS".format(proj_number))

        # records 2 and 3 plus at least one data line must follow the header
        if lineno + 3 >= nlines:
            raise IOError(
                "projection {0} is truncated (header on line {1})".format(
                    proj_number, lineno
                )
            )

        # first two values (NROW, NCOL) and the third are not used
        _, _, _, denergy, fermi = split_numbers(line[8:])
        energy_delta = _fort25_consistent(
            "delta energy", energy_delta, denergy, proj_number)
        fermi_energy = _fort25_consistent(
            "fermi energy", fermi_energy, fermi, proj_number)

        lineno += 1
        ienergy = split_numbers(lines[lineno].strip())[1]
        initial_energy = _fort25_consistent(
            "initial energy", initial_energy, ienergy, proj_number)

        lineno += 1
        projid, norbitals, _, _, _, _ = [int(i) for i in lines[lineno].split()]

        lineno += 1
        line = lines[lineno].strip()

        # gather values until the next projection header or the end of file;
        # at EOF ``lineno`` is left on the last line and the outer loop
        # steps past it
        dos = []
        while not line.startswith("-%-"):
            dos += split_numbers(line)
            if lineno + 1 >= nlines:
                break
            lineno += 1
            line = lines[lineno].strip()

        len_dos = _fort25_consistent(
            "dos value lengths", len_dos, len(dos), proj_number)

        projection = {
            "id": projid,
            "norbitals": norbitals,
            "dos": dos,
        }
        if projid not in alpha_projections:
            alpha_projections[projid] = projection
        elif projid in beta_projections:
            raise IOError(
                "three data sets with same projid ({0}) were found".format(projid)
            )
        else:
            beta_projections[projid] = projection

    if proj_number == 0:
        # fail with a parsing error instead of a TypeError from int(None)
        raise IOError("no projections (records starting with '-%-') were found")

    system_type = IHFERM_MAP[int(system_type)]
    fermi_energy = convert_units(float(fermi_energy), "hartree", "eV")

    energy_delta = convert_units(float(energy_delta), "hartree", "eV")
    initial_energy = convert_units(float(initial_energy), "hartree", "eV")
    len_dos = int(len_dos)
    # one energy per DOS value, exactly ``energy_delta`` apart:
    # linspace(E0, E0 + n * dE, n) would stretch the spacing by n / (n - 1)
    energies = (initial_energy + energy_delta * np.arange(len_dos)).tolist()

    total_alpha = None
    total_beta = None
    if alpha_projections:
        total_alpha = alpha_projections.pop(max(alpha_projections.keys()))
    if beta_projections:
        total_beta = beta_projections.pop(max(beta_projections.keys()))

    return {
        "units": {"conversion": "CODATA2014", "energy": "eV"},
        "energy": energies,
        "system_type": system_type,
        "fermi_energy": fermi_energy,
        "total_alpha": total_alpha,
        "total_beta": total_beta,
        "projections_alpha": list(alpha_projections.values())
        if alpha_projections
        else None,
        "projections_beta": list(beta_projections.values())
        if beta_projections
        else None,
    }


def _fort25_consistent(label, previous, current, proj_number):
    """Return the value shared by all projections, raising IOError on a mismatch."""
    if previous is None:
        return current
    if not previous == current:
        raise IOError(
            "projection {0} has different {1} ({2}) to previous ({3})".format(
                proj_number, label, current, previous
            )
        )
    return previous
def parse_scf_section(lines, initial_lineno, final_lineno=None):
    """read scf data

    Collects one dict per SCF cycle, starting at every line matching
    ``CYC*``, until a line containing ``SCF ENDED`` or ``final_lineno`` is
    reached.

    Parameters
    ----------
    lines: list[str]
    initial_lineno: int
        index of the first line to scan
    final_lineno: int or None
        index at which to stop scanning; ``None`` means no fixed limit

    Returns
    -------
    ParsedSection
        the line number where parsing stopped and the list of cycles. The
        third argument is an error message when the cycle numbers are not
        sequential or the end of the section was not found; the fourth is
        the stripped end line when it does not contain ``CONVERGE``.

    Raises
    ------
    IOError
        if a cycle line reports ``ETOT`` without the ``(AU)`` units

    """
    scf = []
    scf_cyc = None
    last_cyc_num = None
    # defined up front so the fall-through return works with nothing to scan
    curr_lineno = initial_lineno
    for k, line in enumerate(lines[initial_lineno:]):
        curr_lineno = k + initial_lineno

        if "SCF ENDED" in line or (final_lineno is not None
                                   and curr_lineno == final_lineno):
            # add last scf cycle (a cycle with no recorded data is an empty
            # dict and is not appended)
            if scf_cyc:
                scf.append(scf_cyc)
            if "CONVERGE" not in line:
                return ParsedSection(curr_lineno, scf, None, line.strip())
            else:
                return ParsedSection(curr_lineno, scf, None)

        line = line.strip()

        if fnmatch(line, "CYC*"):

            # start new cycle
            if scf_cyc is not None:
                scf.append(scf_cyc)
            scf_cyc = {}

            # check we are adding them in sequential order
            cur_cyc_num = split_numbers(line)[0]
            if last_cyc_num is not None:
                if cur_cyc_num != last_cyc_num + 1:
                    return ParsedSection(
                        curr_lineno,
                        scf,
                        "was expecting the SCF cyle number to be {0} in line {1}: {2}"
                        .format(int(last_cyc_num + 1), curr_lineno, line),
                    )
            last_cyc_num = cur_cyc_num

            if fnmatch(line, "*ETOT*"):
                if not fnmatch(line, "*ETOT(AU)*"):
                    raise IOError("was expecting units in a.u. on line {0}, "
                                  "got: {1}".format(curr_lineno, line))
                # this is the initial energy of the configuration and so actually the energy of the previous run
                if scf:
                    scf[-1].setdefault("energy", {})["total"] = convert_units(
                        split_numbers(line)[1], "hartree", "eV")

        elif scf_cyc is None:
            # nothing to record before the first cycle starts
            continue

        # The total magnetization is the integral of the magnetization in the cell:
        #     MT=∫ (nup-ndown) d3 r
        #
        # The absolute magnetization is the integral of the absolute value of the magnetization in the cell:
        #     MA=∫ |nup-ndown| d3 r
        #
        # In a simple ferromagnetic material they should be equal (except possibly for an overall sign).
        # In simple antiferromagnets (like FeO) MT is zero and MA is twice the magnetization of each of the two atoms.

        if line.startswith("CHARGE NORMALIZATION FACTOR"):
            scf_cyc["charge_normalization_factor"] = split_numbers(line)[0]
        if line.startswith("SUMMED SPIN DENSITY"):
            scf_cyc["spin_density_total"] = split_numbers(line)[0]

        if line.startswith("TOTAL ATOMIC CHARGES"):
            scf_cyc["atomic_charges_peratom"] = _scf_numeric_rows(
                lines, curr_lineno + 1)
        if line.startswith("TOTAL ATOMIC SPINS"):
            spins = _scf_numeric_rows(lines, curr_lineno + 1)
            scf_cyc["spin_density_peratom"] = spins
            # sum over every atom, not only those on the first row of values
            scf_cyc["spin_density_absolute"] = sum([abs(s) for s in spins])

    # add last scf cycle
    if scf_cyc:
        scf.append(scf_cyc)

    return ParsedSection(
        curr_lineno,
        scf,
        "Did not find end of SCF section (starting on line {})".format(
            initial_lineno),
    )


def _scf_numeric_rows(lines, start_lineno):
    """Gather the numbers of consecutive all-numeric rows from ``start_lineno``."""
    numbers = []
    j = start_lineno
    # stop at the end of the file, or at the first row where some token is
    # not counted as a number by ``split_numbers``
    while j < len(lines):
        row = split_numbers(lines[j])
        if len(lines[j].strip().split()) != len(row):
            break
        numbers += row
        j += 1
    return numbers
def parse_geometry_section(data, initial_lineno, line, lines):
    """Parse a section of geometry related variables.

    ``line`` is tested against the known section headings; when one matches,
    the lines following it are read and the result is stored in ``data``
    (under ``"primitive_cell"`` or ``"crystallographic_cell"``). Lines that
    match no heading leave ``data`` untouched.

    Parameters
    ----------
    data: dict
        existing data to add the geometry data to (modified in place)
    initial_lineno: int
        index of ``line`` within ``lines``
    line: str
        the current line
    lines: list[str]

    Raises
    ------
    IOError
        if a section does not have the expected headers or units, if cell
        parameters are missing for a non-periodic direction, or if lattice
        vectors are found a second time for the same ``data``

    Notes
    -----

    For initial and 'FINAL OPTIMIZED GEOMETRY' only::

        DIRECT LATTICE VECTORS CARTESIAN COMPONENTS (ANGSTROM)
                X                    Y                    Z
        0.355114561000E+01   0.000000000000E+00   0.000000000000E+00
        0.000000000000E+00   0.355114561000E+01   0.000000000000E+00
        0.000000000000E+00   0.000000000000E+00   0.535521437000E+01


        CARTESIAN COORDINATES - PRIMITIVE CELL
        *******************************************************************************
        *      ATOM          X(ANGSTROM)         Y(ANGSTROM)         Z(ANGSTROM)
        *******************************************************************************
            1    26 FE    0.000000000000E+00  0.000000000000E+00  0.000000000000E+00
            2    26 FE    1.775572805000E+00  1.775572805000E+00  0.000000000000E+00
            3    16 S    -1.110223024625E-16  1.775572805000E+00  1.393426779074E+00
            4    16 S     1.775572805000E+00  7.885127240037E-16 -1.393426779074E+00

    For initial, final and optimisation steps:

    Primitive cell::

        PRIMITIVE CELL - CENTRING CODE 1/0 VOLUME=    36.099581 - DENSITY  6.801 g/cm^3
                A              B              C           ALPHA      BETA       GAMMA
            2.94439264     2.94439264     4.16400000    90.000000  90.000000  90.000000
        *******************************************************************************
        ATOMS IN THE ASYMMETRIC UNIT    4 - ATOMS IN THE UNIT CELL:    4
            ATOM                 X/A                 Y/B                 Z/C
        *******************************************************************************
            1 T  28 NI    0.000000000000E+00  0.000000000000E+00  0.000000000000E+00

    Crystallographic cell (only if the geometry is not originally primitive)::

        CRYSTALLOGRAPHIC CELL (VOLUME=         74.61846100)
                A              B              C           ALPHA      BETA       GAMMA
            4.21000000     4.21000000     4.21000000    90.000000  90.000000  90.000000

        COORDINATES IN THE CRYSTALLOGRAPHIC CELL
            ATOM                 X/A                 Y/B                 Z/C
        *******************************************************************************
            1 T  12 MG    0.000000000000E+00  0.000000000000E+00  0.000000000000E+00

    """

    def is_atom_row(lineno):
        # an atom table ends at the end of the file, at a blank line or at a
        # line whose first character is a letter
        if lineno >= len(lines):
            return False
        text = lines[lineno].strip()
        return bool(text) and not text[0].isalpha()

    cell_param_names = ["a", "b", "c", "alpha", "beta", "gamma"]

    # check that units are correct (probably not needed)
    if fnmatch(line, "LATTICE PARAMETERS*(*)"):
        if not ("ANGSTROM" in line and "DEGREES" in line):
            raise IOError(
                "was expecting lattice parameters in angstroms and degrees on line:"
                " {0}, got: {1}".format(initial_lineno, line))
        return

    for pattern, field, pattern2 in [
        ("PRIMITIVE*CELL*", "primitive_cell", "ATOMS IN THE ASYMMETRIC UNIT*"),
        (
            "CRYSTALLOGRAPHIC*CELL*",
            "crystallographic_cell",
            "COORDINATES IN THE CRYSTALLOGRAPHIC CELL",
        ),
    ]:
        if fnmatch(line, pattern):
            # cell heading: column titles on the next line, values after it
            if not fnmatch(lines[initial_lineno + 1].strip(),
                           "A*B*C*ALPHA*BETA*GAMMA"):
                raise IOError("was expecting A B C ALPHA BETA GAMMA on line:"
                              " {0}, got: {1}".format(
                                  initial_lineno + 1,
                                  lines[initial_lineno + 1]))
            data[field] = edict.merge([
                data.get(field, {}),
                {
                    "cell_parameters":
                    dict(
                        zip(
                            cell_param_names,
                            split_numbers(lines[initial_lineno + 2]),
                        ))
                },
            ])
        elif fnmatch(line, pattern2):
            # atom table heading: the column titles tell which directions
            # are periodic (fractional) and which are given in angstrom
            header = lines[initial_lineno + 1].strip()
            periodic = [True, True, True]
            if not fnmatch(header, "ATOM*X/A*Y/B*Z/C"):
                # for 2d (slab) can get z in angstrom (and similar for 1d)
                if fnmatch(header, "ATOM*X/A*Y/B*Z(ANGSTROM)*"):
                    periodic = [True, True, False]
                elif fnmatch(header, "ATOM*X/A*Y(ANGSTROM)*Z(ANGSTROM)*"):
                    periodic = [True, False, False]
                elif fnmatch(header,
                             "ATOM*X(ANGSTROM)*Y(ANGSTROM)*Z(ANGSTROM)*"):
                    periodic = [False, False, False]
                    # fully non-periodic: a fixed 500 x 500 x 500 cell with
                    # 90 degree angles is recorded
                    cell_params = dict(
                        zip(
                            cell_param_names,
                            [500.0, 500.0, 500.0, 90.0, 90.0, 90.0],
                        ))
                    data[field] = edict.merge([
                        data.get(field, {}), {
                            "cell_parameters": cell_params
                        }
                    ])
                else:
                    raise IOError(
                        "was expecting ATOM X Y Z (in units of ANGSTROM or fractional) on line:"
                        " {0}, got: {1}".format(initial_lineno + 1,
                                                lines[initial_lineno + 1]))
            if not all(periodic) and "cell_parameters" not in data.get(
                    field, {}):
                raise IOError(
                    "require cell parameters to have been set for non-periodic directions in line"
                    " #{0} : {1}".format(initial_lineno + 1,
                                         lines[initial_lineno + 1]))
            c, alpha, beta = None, None, None
            if not all(periodic):
                cell = data[field]["cell_parameters"]
                _a, _b, c, alpha, beta, _gamma = [
                    cell[p] for p in cell_param_names
                ]

            curr_lineno = initial_lineno + 3
            atom_data = {
                "ids": [],
                "assymetric": [],
                "atomic_numbers": [],
                "symbols": [],
            }
            atom_data["pbc"] = periodic
            while is_atom_row(curr_lineno):
                fields = lines[curr_lineno].strip().split()
                atom_data["ids"].append(fields[0])
                atom_data["assymetric"].append(bool(strtobool(fields[1])))
                atom_data["atomic_numbers"].append(int(fields[2]))
                atom_data["symbols"].append(fields[3].lower().capitalize())
                if all(periodic):
                    atom_data.setdefault("fcoords", []).append(
                        [float(fields[4]),
                         float(fields[5]),
                         float(fields[6])])
                elif periodic == [True, True, False
                                  ] and alpha == 90 and beta == 90:
                    # z is given in angstrom: divide by c to get a
                    # fractional coordinate
                    atom_data.setdefault("fcoords", []).append([
                        float(fields[4]),
                        float(fields[5]),
                        float(fields[6]) / c
                    ])
                elif periodic == [False, False, False]:
                    atom_data.setdefault("ccoords", []).append(
                        [float(fields[4]),
                         float(fields[5]),
                         float(fields[6])])

                # TODO other periodic types (1D)
                curr_lineno += 1

            data[field] = edict.merge([data.get(field, {}), atom_data])

    # TODO These coordinates are present in initial and final optimized sections,
    # but DON'T work with lattice parameters
    if fnmatch(line, "CARTESIAN COORDINATES - PRIMITIVE CELL*"):
        if not fnmatch(
                lines[initial_lineno + 2].strip(),
                "*ATOM*X(ANGSTROM)*Y(ANGSTROM)*Z(ANGSTROM)",
        ):
            raise IOError(
                "was expecting ATOM X(ANGSTROM) Y(ANGSTROM) Z(ANGSTROM) on line:"
                " {0}, got: {1}".format(initial_lineno + 2,
                                        lines[initial_lineno + 2]))

        curr_lineno = initial_lineno + 4
        atom_data = {
            "ids": [],
            "atomic_numbers": [],
            "symbols": [],
            "ccoords": []
        }
        while is_atom_row(curr_lineno):
            fields = lines[curr_lineno].strip().split()
            if len(fields) < 6:
                raise IOError("was expecting ID ANUM SYMBOL X Y Z on line:"
                              " {0}, got: {1}".format(curr_lineno,
                                                      lines[curr_lineno]))
            atom_data["ids"].append(fields[0])
            atom_data["atomic_numbers"].append(int(fields[1]))
            atom_data["symbols"].append(fields[2].lower().capitalize())
            atom_data["ccoords"].append(
                [float(fields[3]),
                 float(fields[4]),
                 float(fields[5])])
            curr_lineno += 1
        data["primitive_cell"] = edict.merge(
            [data.get("primitive_cell", {}), atom_data])

    elif fnmatch(line, "DIRECT LATTICE VECTORS CARTESIAN COMPONENTS*"):
        if "ANGSTROM" not in line:
            raise IOError("was expecting lattice vectors in angstroms on line:"
                          " {0}, got: {1}".format(initial_lineno, line))
        if not fnmatch(lines[initial_lineno + 1].strip(), "X*Y*Z"):
            raise IOError("was expecting X Y Z on line:"
                          " {0}, got: {1}".format(initial_lineno + 1,
                                                  lines[initial_lineno + 1]))
        # the vectors are stored with the primitive cell, so that is also
        # where an earlier set has to be looked for (and where the entry is
        # created if no primitive cell data exists yet)
        primitive_cell = data.setdefault("primitive_cell", {})
        if "cell_vectors" in primitive_cell:
            raise IOError("found multiple cell vectors on line:"
                          " {0}, got: {1}".format(initial_lineno + 1,
                                                  lines[initial_lineno + 1]))
        primitive_cell["cell_vectors"] = {
            "a": split_numbers(lines[initial_lineno + 2]),
            "b": split_numbers(lines[initial_lineno + 3]),
            "c": split_numbers(lines[initial_lineno + 4]),
        }
def initial_parse(lines):
    """Scan the file for errors, and find the final elapsed time value.

    Each line is tested against a single chain of conditions, so it is
    recorded under the first category it matches only (e.g. a line containing
    both 'WARNING' and 'ERROR' is stored as a warning).

    Parameters
    ----------
    lines: list[str]
        the lines of the output file

    Returns
    -------
    errors: list[str]
        stripped lines containing 'ERROR' (except the
        'open_hca: getaddr_netdev ERROR' message), 'SCF abnormal end' or
        'CONVERGENCE TESTS UNSATISFIED' (case-insensitive),
        plus the first line containing 'MPI_Abort'
    warnings: list[str]
        stripped lines containing 'WARNING' (case-insensitive)
    parser_errors: list[str]
        messages about unexpected repeats of section start markers
    total_seconds: int or None
        the first number that ``split_numbers`` finds after 'TELAPSE', on the
        last line containing it, or None if there is no such line
    start_lines: dict
        line indices of the section starts that were found, under the keys
        'optimization' (int), 'mulliken' (list[int]) and
        'final_geometry' (int)

    """
    errors = []
    warnings = []
    parser_errors = []
    mpi_abort = False
    telapse_line = None
    start_lines = {}

    second_opt_line = False
    # This is required since output looks like
    # OPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPT

    # STARTING GEOMETRY OPTIMIZATION - INFORMATION ON SCF MOVED TO SCFOUT.LOG
    # GEOMETRY OPTIMIZATION INFORMATION STORED IN OPTINFO.DAT

    # OPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPTOPT
    # i.e. two marker lines are expected per optimisation, and only a third
    # one is reported as a parser error

    for lineno, line in enumerate(lines):

        if "WARNING" in line.upper():
            warnings.append(line.strip())
        elif "ERROR" in line:
            # TODO ignore errors before program execution (e.g. in mpiexec setup)?
            if "open_hca: getaddr_netdev ERROR" not in line:
                errors.append(line.strip())
        elif "SCF abnormal end" in line:  # only present when run using runcry
            errors.append(line.strip())
        elif "MPI_Abort" in line:
            # only record one mpi_abort event (to not clutter output)
            if not mpi_abort:
                errors.append(line.strip())
                mpi_abort = True
        elif "CONVERGENCE TESTS UNSATISFIED" in line.upper():
            errors.append(line.strip())
        elif "TELAPSE" in line:
            # keep overwriting, so that the last occurrence wins
            telapse_line = lineno

        # search for an optimisation
        elif "OPTOPTOPTOPT" in line:
            if "optimization" in start_lines:
                if second_opt_line:
                    parser_errors.append(
                        "found two lines starting  optimization section: "
                        "{0} and {1}".format(start_lines["optimization"],
                                             lineno))
                else:
                    second_opt_line = True
            # the section start is always moved to the latest marker line
            start_lines["optimization"] = lineno
        elif ("CONVERGENCE ON GRADIENTS SATISFIED AFTER THE FIRST OPTIMIZATION CYCLE"
              in line):
            if "optimization" in start_lines:
                if second_opt_line:
                    parser_errors.append(
                        "found two lines starting optimization section: "
                        "{0} and {1}".format(start_lines["optimization"],
                                             lineno))
                else:
                    second_opt_line = True
            start_lines["optimization"] = lineno

        # search for mulliken analysis
        elif line.strip().startswith("MULLIKEN POPULATION ANALYSIS"):
            # can have ALPHA+BETA ELECTRONS and ALPHA-BETA ELECTRONS (denoted in line above mulliken_starts)
            start_lines.setdefault("mulliken", []).append(lineno)

        # search for final geometry
        elif "FINAL OPTIMIZED GEOMETRY" in line:
            if "final_geometry" in start_lines:
                parser_errors.append(
                    "found two lines starting 'FINAL OPTIMIZED GEOMETRY':"
                    " {0} and {1}".format(start_lines["final_geometry"],
                                          lineno))
            start_lines["final_geometry"] = lineno

    total_seconds = None
    # compare against None explicitly: line index 0 is a valid match but falsy
    if telapse_line is not None:
        total_seconds = int(
            split_numbers(lines[telapse_line].split("TELAPSE")[1])[0])

    return errors, warnings, parser_errors, total_seconds, start_lines
def parse_mulliken_analysis(lines, mulliken_indices):
    """Read the atomic orbital and shell populations of each mulliken section.

    Parameters
    ----------
    lines: list[str]
    mulliken_indices: list[int]
        index of the line starting each mulliken section (must not be empty);
        the line directly above each one has to read either
        'ALPHA+BETA ELECTRONS' or 'ALPHA-BETA ELECTRONS'

    Returns
    -------
    ParsedSection
        created from the first mulliken index, a dict mapping
        'alpha+beta_electrons' / 'alpha-beta_electrons' to the data read
        and, if a section has an unexpected name, an error message

    """
    mulliken = {}

    for i, indx in enumerate(mulliken_indices):
        name = lines[indx - 1].strip().lower()
        if name not in ("alpha+beta electrons", "alpha-beta electrons"):
            return ParsedSection(
                mulliken_indices[0],
                mulliken,
                "was expecting mulliken to be alpha+beta or alpha-beta on line:"
                " {0}, got: {1}".format(indx - 1, lines[indx - 1]),
            )
        key_name = name.replace(" ", "_")

        # a section runs up to the start of the next one (or the file end)
        if len(mulliken_indices) > i + 1:
            searchlines = lines[indx + 1:mulliken_indices[i + 1]]
        else:
            searchlines = lines[indx + 1:]

        data_ao = {}
        data_shell = {}

        for j, line in enumerate(searchlines):
            header = line.strip()
            # the table rows start two lines below the table header
            if fnmatch(header, "*ATOM*Z*CHARGE*A.O.*POPULATION*"):
                _read_population_rows(searchlines, j + 2, data_ao, "aos")
            elif fnmatch(header, "*ATOM*Z*CHARGE*SHELL*POPULATION*"):
                _read_population_rows(searchlines, j + 2, data_shell,
                                      "shells")

        # TODO check consistency of ids, ...
        # keys shared by both tables take the values of the shell table
        data_ao.update(data_shell)

        mulliken[key_name] = data_ao

    return ParsedSection(mulliken_indices[0], mulliken)


def _read_population_rows(searchlines, first_row, table, population_key):
    """Append the rows of one mulliken population table to ``table``.

    Reading stops at the first blank line, at the first line starting with a
    letter, or at the end of ``searchlines`` (e.g. a truncated output file).

    Parameters
    ----------
    searchlines: list[str]
    first_row: int
        index of the first table row in ``searchlines``
    table: dict
        updated in place with the keys 'ids', 'symbols', 'atomic_numbers',
        'charges' and ``population_key``
    population_key: str
        key under which the populations are stored ('aos' or 'shells')

    """
    row = first_row
    while row < len(searchlines):
        text = searchlines[row].strip()
        if not text or text[0].isalpha():
            break
        fields = text.split()
        numbers = split_numbers(searchlines[row])
        # populations can wrap multiple lines: a line holding a non-numeric
        # field (the atomic symbol) starts a new atom, whereas an all-numeric
        # line continues the populations of the previous atom
        if len(fields) != len(numbers):
            table.setdefault("ids", []).append(int(fields[0]))
            table.setdefault("symbols", []).append(
                fields[1].lower().capitalize())
            table.setdefault("atomic_numbers", []).append(int(fields[2]))
            table.setdefault("charges", []).append(float(fields[3]))
            table.setdefault(population_key, []).append(
                [float(f) for f in fields[4:]])
        else:
            table[population_key][-1].extend(numbers)
        row += 1
def parse_optimisation(lines, initial_lineno):
    """read geometric optimisation

    Parameters
    ----------
    lines: list[str]
    initial_lineno: int
        index of the line starting the optimisation section

    Returns
    -------
    ParsedSection
        created from the index of the last line read, the list of data dicts
        (one per retained optimisation step) and, on failure, an error message

    Raises
    ------
    IOError
        if the optimisation converged on the first cycle and its 'OPT END -'
        line does not contain 'E(AU)'

    """
    if ("CONVERGENCE ON GRADIENTS SATISFIED AFTER THE FIRST OPTIMIZATION CYCLE"
            in lines[initial_lineno]):
        # no optimisation steps were taken, so only the final energy,
        # given on the 'OPT END' line, is recorded
        for k, line in enumerate(lines[initial_lineno:]):
            curr_lineno = initial_lineno + k
            line = line.strip()

            if "OPT END -" in line:

                if not fnmatch(line, "*E(AU)*"):
                    raise IOError("was expecting units in a.u. on line:"
                                  " {0}, got: {1}".format(curr_lineno, line))
                data = [{
                    "energy": {
                        "total_corrected":
                        convert_units(split_numbers(line)[0], "hartree", "eV")
                    }
                }]

                return ParsedSection(curr_lineno, data)

        return ParsedSection(
            curr_lineno,
            [],
            "did not find 'OPT END', after optimisation start at line {}".
            format(initial_lineno),
        )

    opt_cycles = []
    opt_cyc = None  # data of the step being read (None before the first one)
    scf_start_no = None  # line starting the scf of the current step
    failed_opt_step = False

    for k, line in enumerate(lines[initial_lineno:]):
        curr_lineno = initial_lineno + k
        line = line.strip()

        if "OPT END -" in line:
            if opt_cyc and not failed_opt_step:
                opt_cycles.append(opt_cyc)
            return ParsedSection(curr_lineno, opt_cycles)

        if fnmatch(line, "*OPTIMIZATION*POINT*"):
            # a new step starts: store the previous one (unless it failed)
            if opt_cyc is not None and not failed_opt_step:
                opt_cycles.append(opt_cyc)
            opt_cyc = {}
            scf_start_no = None
            failed_opt_step = False
        elif opt_cyc is None:
            # ignore everything before the first optimisation point
            continue

        # when using ONELOG optimisation key word
        if "CRYSTAL - SCF - TYPE OF CALCULATION :" in line:
            if scf_start_no is not None:
                return ParsedSection(
                    curr_lineno,
                    opt_cycles,
                    "found two lines starting scf ('CRYSTAL - SCF - ') in opt step {0}:"
                    .format(len(opt_cycles)) +
                    " {0} and {1}".format(scf_start_no, curr_lineno),
                )
            scf_start_no = curr_lineno
        elif "SCF ENDED" in line:
            # TODO record an error if "CONVERGE" is not in the line?
            if scf_start_no is None:
                # without a start line the scf section cannot be delimited
                # (scf_start_no + 1 would raise a TypeError)
                return ParsedSection(
                    curr_lineno,
                    opt_cycles,
                    "found 'SCF ENDED' without a preceding line starting scf "
                    "('CRYSTAL - SCF - ') in opt step {0}: {1}".format(
                        len(opt_cycles), curr_lineno),
                )
            outcome = parse_scf_section(lines, scf_start_no + 1,
                                        curr_lineno + 1)
            # TODO test if error
            opt_cyc["scf"] = outcome.data

        parse_geometry_section(opt_cyc, curr_lineno, line, lines)

        # TODO move to read_post_scf?
        if fnmatch(line, "TOTAL ENERGY*DE*"):
            if not fnmatch(line, "TOTAL ENERGY*AU*DE*AU*"):
                return ParsedSection(
                    curr_lineno,
                    opt_cycles,
                    "was expecting units in a.u. on line:"
                    " {0}, got: {1}".format(curr_lineno, line),
                )
            # the second number on the line is taken as the energy
            opt_cyc.setdefault("energy", {})["total_corrected"] = (
                convert_units(split_numbers(line)[1], "hartree", "eV"))

        for param in [
                "MAX GRADIENT", "RMS GRADIENT", "MAX DISPLAC", "RMS DISPLAC"
        ]:
            if fnmatch(line, "{}*CONVERGED*".format(param)):
                # the last word of the line holds the converged flag
                opt_cyc.setdefault("convergence", {})[param.lower().replace(
                    " ", "_")] = bool(strtobool(line.split()[-1]))

        if fnmatch(line,
                   "*SCF DID NOT CONVERGE. RETRYING WITH A SMALLER OPT STEP*"):
            # TODO add failed optimisation steps with dummy energy and extra parameter?
            # for now discard this optimisation step
            failed_opt_step = True

    # the lines ran out before 'OPT END': keep what was read, but flag it
    if opt_cyc and not failed_opt_step:
        opt_cycles.append(opt_cyc)

    return ParsedSection(
        curr_lineno,
        opt_cycles,
        "did not find 'OPT END', after optimisation start at line {}".format(
            initial_lineno),
    )