Ejemplos de get_root_source en Python, ejemplos de matador.utils.chem_utils.get_root_source en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: test_chem_utils.py Proyecto: ml-evs/matador

    def test_root_source(self):
        """ Check the root source name that `get_root_source` extracts
        from several kinds of `source` input.

        """
        # (expected root source, source input) pairs, checked in order
        resolvable_cases = [
            ("KP", ["KP.cell", "KP.param", "KP.castep"]),
            ("KP-1234-abcd", ["KP.cell", "KP.param", "KP-1234-abcd.castep"]),
            (
                "KP-0.02.-1234-abcd",
                [
                    "KP.cell",
                    "KP.param",
                    "abcd-123.fdasf/efgf/KP-0.02.-1234-abcd.castep",
                    "KP-0.02.-1234-abcd.res",
                ],
            ),
            (
                "KP-0.02.-1234-abcd",
                [
                    "KP.cell",
                    "KP.param",
                    "abcd-123.fdasf/efgf/KP-0.02.-1234-abcd.history",
                ],
            ),
            ("PK-OQMD_12345", ["KP.cell", "KP.param", "PK-OQMD_12345.history"]),
            ("OQMD 12345", ["OQMD 12345"]),
            ("OQMD-12345", ["OQMD-12345"]),
        ]
        for expected, sources in resolvable_cases:
            self.assertEqual(expected, get_root_source(sources))

        # a .castep and a .res file with different base names must raise
        mismatched_sources = [
            "KP.cell",
            "KP.param",
            "abcd-123.fdasf/efgf/KP-0.02.-1234-abcd.castep",
            "KP-1234-abcde.res",
        ]
        with self.assertRaises(RuntimeError):
            get_root_source(mismatched_sources)

        # input without any file-like entries is expected back unchanged,
        # whether it is wrapped in a list or is a bare string
        passthrough_cases = [
            ("not a file name", ["not a file name"]),
            ("not even a list", "not even a list"),
        ]
        for expected, sources in passthrough_cases:
            self.assertEqual(expected, get_root_source(sources))

Ejemplo n.º 2

0

Mostrar archivo

    def root_source(self):
        """ Return the cached root source, refreshing it first from the
        'source' entry of the underlying data when that entry exists.

        A RuntimeError raised while computing the root source is ignored,
        in which case the previously stored value is returned.

        """
        from matador.utils.chem_utils import get_root_source
        try:
            source_available = 'source' in self._data
            if source_available:
                sources = self._data['source']
                self._root_source = get_root_source(sources)
        except RuntimeError:
            # keep whatever value was stored before
            pass

        return self._root_source

Ejemplo n.º 3

0

Mostrar archivo

    def add_root_source(self):
        """ Add the "root_source" key to each document in the cursor that
        lacks it, i.e. the name of the structure, minus file extension.

        Modified documents are appended to `self.diff_cursor` and counted
        in `self.changed_count`; any error is printed and counted in
        `self.failed_count`.

        """
        from matador.utils.chem_utils import get_root_source
        for document in self.cursor:
            try:
                # documents that already carry the key are left untouched
                if 'root_source' not in document:
                    document['root_source'] = get_root_source(document['source'])
                    self.diff_cursor.append(document)
                    self.changed_count += 1
            except Exception as error:
                print(repr(error))
                self.failed_count += 1

Ejemplo n.º 4

0

Mostrar archivo

Archivo: hull_ensemble.py Proyecto: ml-evs/matador

    def generate_stability_statistics(self, group_by='structure'):
        """ Count how many times each structure (or formula) appears among
        the stable structures of the phase diagrams in the ensemble.

        Keyword arguments:
            group_by (str): 'formula' to key the counts by chemical formula;
                any other value keys them by root source.

        Returns:
            collections.defaultdict: mapping of label to number of occurrences.

        """
        from collections import defaultdict
        counts = defaultdict(int)
        by_formula = group_by == 'formula'
        for diagram in self.phase_diagrams:
            for structure in diagram.stable_structures:
                if by_formula:
                    label = get_formula_from_stoich(structure['stoichiometry'])
                else:
                    label = get_root_source(structure)
                counts[label] += 1
        return counts

Ejemplo n.º 5

0

Mostrar archivo

Archivo: export.py Proyecto: ml-evs/matador

def query2files(cursor,
                dirname=None,
                max_files=10000,
                top=None,
                prefix=None,
                cell=None,
                param=None,
                res=None,
                pdb=None,
                json=None,
                xsf=None,
                markdown=True,
                latex=False,
                subcmd=None,
                argstr=None,
                **kwargs):
    """ Many-to-many convenience function for many structures being written to
    many file types.

    Parameters:
        cursor (:obj:`list` of :obj:`dict`/:class:`AtomicSwapper`): list of matador dictionaries to write out.

    Keyword arguments:
        dirname (str): the folder to save the results into. Will be created if non-existent.
            Will have integer appended to it if already existing. If None, the name
            is obtained from `generate_relevant_path`.
        max_files (int): if the number of files to be written exceeds this number, then raise RuntimeError.
        top (int): if given and smaller than the number of structures, only the first
            `top` structures are written.
        prefix (str): if given, prepended (followed by '-') to every structure filename.
        cell, param, res, pdb, json, xsf: truthy to write the corresponding file type
            for every structure.
        markdown (bool): NOTE(review): not referenced in the body; the markdown summary
            is currently always written.
        latex (bool): whether to additionally write a LaTeX summary file.
        subcmd (str): name of the calling subcommand; 'polish' and 'swaps' switch off
            `info` for res files, 'swaps' alters the filenames, and 'hull'/'voltage'
            set the `hull` flag handed to `display_results`.
        argstr (str): passed through to `display_results`.
        **kwargs (dict): forwarded to `generate_relevant_path` (when `dirname` is None)
            and to `display_results`.

    Raises:
        RuntimeError: if more than `max_files` structure files would be written.

    """
    # json is not part of this check, nor of the file count below
    multiple_files = any((cell, param, res, pdb, xsf))
    prefix = prefix + '-' if prefix is not None else ''

    if isinstance(cursor, AtomicSwapper):
        cursor = cursor.cursor
        subcmd = "swaps"

    # NOTE(review): hash_dupe is False in both branches; only `info` differs
    if subcmd in ['polish', 'swaps']:
        info = False
        hash_dupe = False
    else:
        info = True
        hash_dupe = False

    # lists are measured with len(); anything else is assumed to offer .count()
    if isinstance(cursor, list):
        num = len(cursor)
    else:
        num = cursor.count()

    if top is not None:
        if top < num:
            num = top

    # number of structures times number of requested per-structure file types
    num_files = num * sum(1 for ext in [cell, param, res, pdb, xsf] if ext)

    if multiple_files:
        print('Intending to write', num, 'structures to file...')
        if num_files > max_files:
            raise RuntimeError(
                "Not writing {} files as it exceeds argument `max_files` limit of {}"
                .format(num_files, max_files))

    if dirname is None:
        dirname = generate_relevant_path(subcmd=subcmd, **kwargs)

    _dir = False
    dir_counter = 0
    # postfix integer on end of directory name if it exists
    while not _dir:
        if dir_counter != 0:
            directory = dirname + str(dir_counter)
        else:
            directory = dirname
        if not os.path.isdir(directory):
            os.makedirs(directory)
            _dir = True
        else:
            dir_counter += 1

    for _, doc in enumerate(cursor[:num]):
        # generate an appropriate filename for the structure
        root_source = get_root_source(doc)

        if '_swapped_stoichiometry' in doc:
            formula = get_formula_from_stoich(doc['_swapped_stoichiometry'])
        else:
            formula = get_formula_from_stoich(doc['stoichiometry'])

        if subcmd == 'swaps':
            root_source = root_source.replace('-swap-', '-')

        name = root_source

        # sources containing 'OQMD ' or 'mp-' are renamed to <formula>-<DB>_<id>
        if 'OQMD ' in root_source:
            name = '{formula}-OQMD_{src}'.format(
                formula=formula, src=root_source.split(' ')[-1])
        elif 'mp-' in root_source:
            name = '{formula}-MP_{src}'.format(formula=formula,
                                               src=root_source.split('-')[-1])
        if 'icsd' in doc and 'CollCode' not in name:
            name += '-CollCode{}'.format(doc['icsd'])
        else:
            pf_id = None
            for source in doc['source']:
                if 'pf-' in source:
                    pf_id = source.split('-')[-1]
                    break
            else:
                # no 'pf-' source found: fall back to the first stored pf_id, if any
                if 'pf_ids' in doc:
                    pf_id = doc['pf_ids'][0]
            if pf_id is not None:
                name += '-PF-{}'.format(pf_id)

        # if swaps, prepend new composition
        if subcmd == 'swaps':
            new_formula = get_formula_from_stoich(get_stoich(
                doc['atom_types']))
            name = '{}-swap-{}'.format(new_formula, name)

        path = "{directory}/{prefix}{name}".format(directory=directory,
                                                   prefix=prefix,
                                                   name=name)

        if param:
            doc2param(doc, path, hash_dupe=hash_dupe)
        if cell:
            doc2cell(doc, path, hash_dupe=hash_dupe)
        if res:
            doc2res(doc, path, info=info, hash_dupe=hash_dupe)
        if json:
            doc2json(doc, path, hash_dupe=hash_dupe)
        if pdb:
            doc2pdb(doc, path, hash_dupe=hash_dupe)
        if xsf:
            doc2xsf(doc, path)

    hull = subcmd in ['hull', 'voltage']
    # rewind database cursors so the summary covers every document again
    if isinstance(cursor, pm.cursor.Cursor):
        cursor.rewind()
    # the summary file is named after the directory it is written into
    md_path = "{directory}/{directory}.md".format(directory=directory)
    md_kwargs = {}
    md_kwargs.update(kwargs)
    md_kwargs.update({
        'markdown': True,
        'latex': False,
        'argstr': argstr,
        'hull': hull
    })
    md_string = display_results(cursor, **md_kwargs)
    with open(md_path, 'w') as f:
        f.write(md_string)

    if latex:
        if isinstance(cursor, pm.cursor.Cursor):
            cursor.rewind()
        tex_path = "{directory}/{directory}.tex".format(directory=directory)
        print('Writing LaTeX file', tex_path + '...')
        tex_kwargs = {}
        tex_kwargs.update(kwargs)
        tex_kwargs.update({
            'latex': True,
            'markdown': False,
            'argstr': argstr,
            'hull': hull
        })
        tex_string = display_results(cursor, **tex_kwargs)
        with open(tex_path, 'w') as f:
            f.write(tex_string)

    print('Done!')

Ejemplo n.º 6

0

Mostrar archivo

def _construct_structure_string(
        doc, ind, formula_substring, gs_enthalpy, use_source, colour, hull, additions, deletions,
        add_index_mode, del_index_mode, energy_key, per_atom, eform, markdown, latex
):
    """ Construct the pretty output for an individual structure.

    Options passed from `matador.utils.cursor_utils.display_results.`

    Returns:
        str: the pretty output.

    """

    # start with two spaces, replaced by the prefix from hull/add/del
    this_struct_string = '  '
    prefix = ''
    suffix = ''
    # apply appropriate prefices and suffices to structure
    if hull and np.abs(doc.get('hull_distance')) <= 0.0 + 1e-12:
        if colour:
            prefix = '\033[92m'
            suffix = '\033[0m'
        this_struct_string = '* '

    if additions is not None:
        if (add_index_mode and ind in additions) or doc.get('text_id', '_') in additions:
            this_struct_string = '+ '
            if colour:
                prefix = '\033[92m'
                suffix = '\033[0m'
    if deletions is not None:
        if (del_index_mode and ind in deletions) or doc.get('text_id', '_') in deletions:
            this_struct_string = '- '
            if colour:
                prefix = '\033[91m'
                suffix = '\033[0m'

    # display the canonical name for the structure
    if use_source:
        src = get_root_source(doc['source'])
        max_len = 34
        this_struct_string += "{:<36.{max_len}}".format(
            src if len(src) <= max_len else src[:max_len-4]+'[..]', max_len=max_len
        )
    else:
        this_struct_string += "{:^24.22}".format(' '.join(doc.get('text_id', ['xxx', 'yyy'])))

    # again, if we're not outputting to markdown, then flag warnings in the quality column
    try:
        if doc.get('prototype'):
            this_struct_string += "{:^5}".format('*p*')
        elif doc.get('quality', 5) == 0:
            this_struct_string += "{:^5}".format('!!!')
        else:
            this_struct_string += "{:^5}".format((5 - doc.get('quality', 5)) * '?')
    except KeyError:
        this_struct_string += "{:^5}".format(' ')

    # loop over header names and print the appropriate values
    if 'pressure' in doc and doc['pressure'] != 'xxx':
        this_struct_string += "{: >9.2f} ".format(doc['pressure'])
    else:
        this_struct_string += "{:^9} ".format('xxx')

    try:
        if per_atom and 'cell_volume' in doc and 'num_atoms' in doc:
            this_struct_string += "{:>12.1f}  ".format(doc['cell_volume'] / doc['num_atoms'])
        elif 'cell_volume' in doc and 'num_fu' in doc:
            this_struct_string += "{:>12.1f}  ".format(doc['cell_volume'] / doc['num_fu'])
        else:
            this_struct_string += "{:^12}  ".format('xxx')
    except Exception:
        this_struct_string += "{:^10} ".format('xxx')

    try:
        if hull and eform:
            this_struct_string += "{:>12.3f}      ".format(
                doc['formation_' + energy_key]
            )
        elif hull:
            this_struct_string += "{:>12.1f}      ".format(
                1000 * doc['hull_distance']
            )
        elif per_atom:
            this_struct_string += "{:>16.4f}  ".format(recursive_get(doc, energy_key) - gs_enthalpy)
        else:
            this_struct_string += "{:>16.4f}  ".format(
                recursive_get(doc, energy_key) * doc['num_atoms'] / doc['num_fu'] - gs_enthalpy
            )
    except KeyError:
        this_struct_string += "{:^18}".format('xxx')

    if latex:
        from matador.utils.cell_utils import get_space_group_label_latex
        this_struct_string += " {:^13} ".format(get_space_group_label_latex(doc.get('space_group', 'xxx')))
    else:
        this_struct_string += " {:^13} ".format(doc.get('space_group', 'xxx'))

    # now we add the formula column
    this_struct_string += " {:^13} ".format(formula_substring)

    if 'num_fu' in doc:
        this_struct_string += " {:^6} ".format(int(doc['num_fu']))
    else:
        this_struct_string += " {:^6} ".format('xxx')

    if 'source' in doc:
        prov = get_guess_doc_provenance(doc['source'], doc.get('icsd'))
        this_struct_string += "{:^8}".format(prov)
    else:
        this_struct_string += "{:^8}".format('xxx')

    this_struct_string = prefix + this_struct_string + suffix

    return this_struct_string

Ejemplo n.º 7

0

Mostrar archivo

Archivo: adapt.py Proyecto: ml-evs/ilustrado

def adapt(
    possible_parents,
    mutation_rate,
    crossover_rate,
    mutations=None,
    max_num_mutations=3,
    max_num_atoms=40,
    structure_filter=None,
    minsep_dict=None,
    debug=False,
):
    """ Take a list of possible parents and randomly adapt
    according to given mutation weightings.

    Parameters:
        possible_parents (list(dict)) : list of all breeding stock,
        mutation_rate (float): rate of mutations relative to crossover,
        crossover_rate (float): see `mutation_rate`.

    Keyword Arguments:
        mutations (list(str)): list of desired mutations to choose from (as strings);
            names that are not recognised are ignored,
        max_num_mutations (int): rand(1, this) mutations will be performed,
        max_num_atoms (int): any structures with more than this many atoms will be filtered out.
        structure_filter (callable(dict)): custom filter to pass to check_feasible.
        minsep_dict (dict): dictionary containing element-specific minimum separations, e.g.
            `{('K', 'K'): 2.5, ('K', 'P'): 2.0}`.
        debug (bool): passed on to mutate/crossover; also prints tracebacks of
            unexpected errors.

    Returns:
        dict: the mutated/newborn structure.

    """
    total_rate = mutation_rate + crossover_rate
    if total_rate != 1.0:
        LOG.debug(
            "Total mutation rate not 1 ({}), rescaling...".format(total_rate))
    mutation_rate /= total_rate
    crossover_rate /= total_rate
    # compare with a tolerance: the rescaled rates need not sum to exactly 1.0
    # in floating point arithmetic
    assert np.isclose(mutation_rate + crossover_rate, 1.0)
    # drawn once, so every retry below stays on the same branch
    mutation_rand_seed = np.random.rand()

    # turn specified mutations string into corresponding functions
    if mutations is not None:
        from .mutate import nudge_positions, null_nudge_positions, permute_atoms
        from .mutate import random_strain, vacancy, voronoi_shuffle, transmute_atoms

        mutation_lookup = {
            "nudge_positions": nudge_positions,
            "null_nudge_positions": null_nudge_positions,
            "permute_atoms": permute_atoms,
            "random_strain": random_strain,
            "voronoi": voronoi_shuffle,
            "vacancy": vacancy,
            "transmute_atoms": transmute_atoms,
        }
        _mutations = [
            mutation_lookup[mutation] for mutation in mutations
            if mutation in mutation_lookup
        ]
    else:
        _mutations = None

    # loop over *SAME* branch (i.e. crossover vs mutation) until valid cell is produced
    # with max attempts of 1000, at which point it will continue with a terrible cell
    valid_cell = False
    max_restarts = 1000
    num_iter = 0
    while not valid_cell and num_iter < max_restarts:
        # if random number is less than mutant rate, then mutate
        if mutation_rand_seed < mutation_rate:
            parent = strip_useless(np.random.choice(possible_parents),
                                   to_run=True)
            try:
                newborn = mutate(
                    parent,
                    mutations=_mutations,
                    max_num_mutations=max_num_mutations,
                    debug=debug,
                )
                parents = [parent]
                valid_cell = check_feasible(
                    newborn,
                    parents,
                    max_num_atoms,
                    structure_filter=structure_filter,
                    minsep_dict=minsep_dict,
                )
            # this will be raised if the mutation fails for a good reason
            except RuntimeError:
                valid_cell = False
            except Exception as oops:
                if debug:
                    print_exc()
                LOG.warning("Mutation failed with error {}".format(oops))
                valid_cell = False
        # otherwise, do crossover
        else:
            if len(possible_parents) > 2:
                parents = [
                    strip_useless(parent, to_run=True) for parent in
                    np.random.choice(possible_parents, size=2, replace=False)
                ]
            elif len(possible_parents) == 2:
                parents = copy.deepcopy(possible_parents)
            elif len(possible_parents) == 1:
                parents = 2 * [copy.deepcopy(possible_parents[0])]
                LOG.warning(
                    "Only one possible parent: performing self-crossover...")

            try:
                newborn = crossover(parents, debug=debug)
                valid_cell = check_feasible(
                    newborn,
                    parents,
                    max_num_atoms,
                    structure_filter=structure_filter,
                    minsep_dict=minsep_dict,
                )
            except RuntimeError:
                valid_cell = False
            except Exception as oops:
                if debug:
                    print_exc()
                LOG.warning("Crossover failed with error {}".format(oops))
                valid_cell = False
        num_iter += 1

    LOG.debug("Initialised newborn after {} trials".format(num_iter))
    if num_iter == max_restarts:
        LOG.warning(
            "Max restarts reached in mutations, something has gone wrong... "
            "running with possibly unphysical cell")
        # NOTE(review): the retry does not forward `structure_filter`; this may
        # be a deliberate relaxation or an omission -- confirm before changing
        newborn = adapt(
            possible_parents,
            mutation_rate,
            crossover_rate,
            mutations=mutations,
            max_num_mutations=max_num_mutations,
            max_num_atoms=max_num_atoms,
            minsep_dict=minsep_dict,
            debug=debug,
        )
    # set parents in newborn dict
    if "parents" not in newborn:
        newborn["parents"] = [
            get_root_source(parent["source"]) for parent in parents
        ]
    return newborn

Ejemplo n.º 8

0

Mostrar archivo

Archivo: test_integration.py Proyecto: ml-evs/matador

    def test_integration(self):
        """ Test import and query.

        Runs the helper workflows in sequence (CASTEP and res imports, swaps,
        changes, export, pseudoternary hulls, uniq, stats, ID query and refine)
        and checks the size of the returned cursors and the files written to
        disk. Output directories are removed again before the related
        assertions are made.

        """
        print("IMPORT CASTEP 1")
        query = import_castep()
        self.assertEqual(len(query.cursor),
                         3,
                         msg="Failed to import structures correctly")

        # run again and hopefully nothing will change, i.e. no duplication
        print("IMPORT CASTEP 2")
        query = import_castep()
        self.assertEqual(len(query.cursor),
                         3,
                         msg="Failed to import structures correctly")

        print("IMPORT CASTEP 3")
        query = import_castep(extra_flags="--recent_only")
        self.assertEqual(len(query.cursor),
                         3,
                         msg="Failed to import structures correctly")

        print("IMPORT RES")
        query_1, query_2 = import_res()
        self.assertEqual(len(query_1.cursor),
                         7,
                         msg="Failed to import res files")
        self.assertEqual(len(query_2.cursor),
                         4,
                         msg="Failed to import res files")
        self.assertEqual(
            query_2.cursor[0]["species_pot"]["K"],
            "2|1.5|9|10|11|30U:40:31(qc=6)",
            msg="Failed to scrape OTF with weird name",
        )
        self.assertEqual(
            query_2.cursor[0]["species_pot"]["Sn"],
            "2|2|2|1.6|9.6|10.8|11.7|50U=-0.395U=+0.25:51U=-0.14U=+0.25",
            msg="Failed to scrape OTF with linebreak",
        )
        self.assertFalse(
            any(["Sb" in doc["species_pot"] for doc in query_2.cursor]),
            msg="pspots over-scraped!",
        )

        print("SWAP")
        output_folder_exists, successes, elem_successes = swaps()
        self.assertTrue(output_folder_exists, msg="No folder created")
        self.assertTrue(all(successes), msg="Failed to even read files")
        # NOTE(review): asserts that NOT all element checks succeeded; confirm
        # this matches what the swaps() helper reports
        self.assertFalse(all(elem_successes), msg="Swaps had wrong elements")

        print("CHANGES")
        query_1, query_2, changes_count = changes()
        self.assertEqual(len(query_1.cursor),
                         3,
                         msg="matador changes did not remove files")
        self.assertEqual(len(query_2.cursor),
                         0,
                         msg="matador changes did not remove files")
        # unclear that mongomock can handle this; just check the queries instead
        # self.assertEqual(changes_count, 1, msg='matador changes did not changelog')

        # clear any leftover output directory before exporting
        expected_dir = "query-ci_test"
        if os.path.isdir(expected_dir):
            for _file in glob.glob(expected_dir + "/*"):
                os.remove(_file)
            os.removedirs(expected_dir)

        print("EXPORT")
        export()
        expected_files = [
            "query-ci_test/query-ci_test.md",
            "query-ci_test/query-ci_test.tex",
            "query-ci_test/Na3Zn4-swap-ReOs-OQMD_759599.pdb",
            "query-ci_test/Na3Zn4-swap-ReOs-OQMD_759599.xsf",
            "query-ci_test/Na3Zn4-swap-ReOs-OQMD_759599.json",
            "query-ci_test/Na-edgecase-CollCode10101.pdb",
            "query-ci_test/Na-edgecase-CollCode10101.xsf",
            "query-ci_test/Na-edgecase-CollCode10101.json",
            "query-ci_test/NaP_intermediates.pdb",
            "query-ci_test/NaP_intermediates.xsf",
            "query-ci_test/NaP_intermediates.json",
        ]
        # record the results first, then clean up, then assert, so that the
        # files are removed even when an assertion fails
        dir_exists = os.path.isdir(expected_dir)
        files_exist = all([os.path.isfile(_file) for _file in expected_files])

        for _file in expected_files:
            if os.path.isfile(_file):
                os.remove(_file)

        if os.path.isdir(expected_dir):
            for _file in glob.glob(expected_dir + "/*"):
                os.remove(_file)
            os.removedirs(expected_dir)

        self.assertTrue(dir_exists, msg="Failed to create output directory")
        self.assertTrue(files_exist, msg="Some files missing from export")

        print("PSEUDOTERNARY HULL")
        query, hull = pseudoternary_hull()
        self.assertTrue(query.args.get("intersection"))
        self.assertTrue(query._non_elemental)
        self.assertTrue(query._create_hull)
        self.assertEqual(len(query.cursor), 7)
        self.assertEqual(len(hull.cursor), 7)
        self.assertEqual(len(hull.hull_cursor), 5)

        expected_dir = "query-LaLiZrO-ci_test"
        if os.path.isdir(expected_dir):
            for _file in glob.glob(expected_dir + "/*"):
                os.remove(_file)
            os.removedirs(expected_dir)

        print("PSEUDOTERNARY HULL 2")
        hull = pseudoternary_hull_no_query()
        self.assertTrue(hull._query.args.get("intersection"))
        self.assertTrue(hull._query._non_elemental)
        self.assertTrue(hull._query._create_hull)
        # NOTE(review): `query` here is still the object from the previous
        # section, not `hull._query` -- confirm this is intended
        self.assertEqual(len(query.cursor), 7)
        self.assertEqual(len(hull.cursor), 7)
        self.assertEqual(len(hull.hull_cursor), 5)

        expected_dir = "query-LaLiZrO-ci_test"
        if os.path.isdir(expected_dir):
            for _file in glob.glob(expected_dir + "/*"):
                os.remove(_file)
            os.removedirs(expected_dir)

        print("UNIQ")
        uniq()
        expected_files = [expected_dir + "/cubic-LLZO-CollCode999999.res"]

        dir_exists = os.path.isdir(expected_dir)
        files_exist = all([os.path.isfile(_file) for _file in expected_files])
        # number of res files actually present in the output directory
        correct_num = len(glob.glob(expected_dir + "/*.res"))

        if os.path.isdir(expected_dir):
            for _file in glob.glob(expected_dir + "/*"):
                os.remove(_file)
            os.removedirs(expected_dir)

        self.assertTrue(dir_exists,
                        msg="Failed to create output directory, uniq")
        self.assertTrue(files_exist,
                        msg="Some files missing from export, uniq")
        self.assertTrue(correct_num == len(expected_files),
                        msg="Incorrect filter")

        print("STATS")
        try:
            stats()
        except TypeError:
            print("Unable to test stats module due to mongomock limitations.")
            pass

        print("ID QUERY")
        query = id_query()
        self.assertEqual(len(query.cursor), 0)

        print("REFINE")
        cursor = refine().cursor
        self.assertTrue(all([doc["doi"] == ["10/12345"] for doc in cursor]))
        self.assertTrue(
            all([doc["tags"] == ["integration_test"] for doc in cursor]))
        self.assertTrue(
            all([doc["root_source"] == get_root_source(doc)
                 for doc in cursor]))
        self.assertTrue(all([isinstance(doc["_raw"], list) for doc in cursor]))

Ejemplo n.º 9

0

Mostrar archivo

Archivo: importer.py Proyecto: ml-evs/matador

    def _struct2db(self, struct):
        """ Insert completed Python dictionary into chosen
        database, with generated text_id. Add quality factor
        for any missing data.

        The structure starts with a quality of 5. Unless the importer runs in
        prototype mode, missing pseudopotentials or a missing xc functional
        set the quality to 0, and a generic OTF pseudopotential, a missing
        cut-off energy or a missing k-point spacing each remove one point.
        Only structures that keep a quality of 5 are inserted; the rest are
        written to the logfile together with the failed checks.

        Parameters:
            struct (dict): dictionary containing structure.

        Returns:
            int: 1 if successfully inputted into database, 0 otherwise.

        """
        try:
            plain_text_id = [
                WORDS[random.randint(0, self.num_words - 1)].strip(),
                NOUNS[random.randint(0, self.num_nouns - 1)].strip()
            ]
            struct['text_id'] = plain_text_id
            if 'tags' in self.tag_dict:
                struct['tags'] = self.tag_dict['tags']
            struct['quality'] = 5
            # if any missing info at all, score = 0
            # include elem set for faster querying
            if 'elems' not in struct:
                struct['elems'] = sorted(set(struct['atom_types']))

            # bound on every path so the log message below can always use it
            failed_checks = []

            # check basic DFT params if we're not in a prototype DB
            if not self.args.get('prototype'):
                if 'species_pot' not in struct:
                    struct['quality'] = 0
                    failed_checks.append('missing all pspots')
                else:
                    specified = []
                    for elem in struct['stoichiometry']:
                        # remove all points for a missing pseudo
                        if elem[0] not in struct['species_pot']:
                            struct['quality'] = 0
                            failed_checks.append('missing pspot for {}'.format(
                                elem[0]))
                        else:
                            specified.append(elem[0])
                            # remove a point for a generic OTF pspot
                            if 'OTF' in struct['species_pot'][elem[0]].upper():
                                struct['quality'] -= 1
                                failed_checks.append(
                                    'pspot not fully specified for {}'.format(
                                        elem[0]))
                    # keep only pseudopotentials for species in the stoichiometry
                    struct['species_pot'] = {
                        species: struct['species_pot'][species]
                        for species in specified
                    }

                if 'xc_functional' not in struct:
                    struct['quality'] = 0
                    failed_checks.append('missing xc functional')

                # record the reason for each deduction so that the logfile
                # never reports a rejection without an explanation
                if 'cut_off_energy' not in struct:
                    struct['quality'] -= 1
                    failed_checks.append('missing cut-off energy')
                if 'kpoints_mp_spacing' not in struct:
                    struct['quality'] -= 1
                    failed_checks.append('missing k-point spacing')

            else:
                struct['prototype'] = True
                struct['xc_functional'] = 'xxx'
                struct['enthalpy_per_atom'] = 0
                struct['enthalpy'] = 0
                struct['pressure'] = 0
                struct['species_pot'] = {}

            root_src = get_root_source(struct)
            struct['root_source'] = root_src
            # fall back to the root source when no source file carries one of
            # the extensions below; otherwise the name would be unbound and the
            # resulting error would be reported after a successful insert
            expanded_root_src = root_src
            exts = ['.castep', '.res', '.history', '.history.gz']
            # the last matching extension in `exts` wins
            for ext in exts:
                for src in struct['source']:
                    if src.endswith(ext):
                        expanded_root_src = src

            if struct['quality'] == 5:
                struct_id = self.repo.insert_one(struct).inserted_id
                self.struct_list.append((struct_id, root_src))
                self.manifest.write('+ {}\n'.format(expanded_root_src))
                if self.debug:
                    print('Inserted', struct_id)
            else:
                self.logfile.write('? {} failed quality checks: {}\n'.format(
                    expanded_root_src, failed_checks))
                if self.debug:
                    print('Error with', root_src)
                return 0

        except Exception as exc:
            # this shouldn't fail, but if it does, fail loudly but cleanly
            self.logfile.write(
                '! Importer produced an unexpected error {}. Final state of struct: {}\n'
                .format(exc, struct))
            tb.print_exc()
            return 0

        return 1