Example #1
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "wt103":
            self.vocab.count_file(os.path.join(path, "train.txt"))
        elif self.dataset == "lm1b":
            train_path_pattern = os.path.join(
                path, "1-billion-word-language-modeling-benchmark-r13output",
                "training-monolingual.tokenized.shuffled", "news.en-*")
            train_paths = glob(train_path_pattern)

            # the vocab will load from file when build_vocab() is called
            # for train_path in sorted(train_paths):
            #   self.vocab.count_file(train_path, verbose=True)

        self.vocab.build_vocab()

        if self.dataset in ["ptb", "wt2", "wt103"]:
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), ordered=True)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), ordered=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), ordered=True)
        elif self.dataset in ["enwik8", "text8"]:
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), ordered=True, add_eos=False)
        elif self.dataset == "lm1b":
            self.train = train_paths
            valid_path = os.path.join(path, "valid.txt")
            test_path = valid_path
            self.valid = self.vocab.encode_file(valid_path,
                                                ordered=True,
                                                add_double_eos=True)
            self.test = self.vocab.encode_file(test_path,
                                               ordered=True,
                                               add_double_eos=True)

        if self.dataset == "wt103":
            self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
        elif self.dataset == "lm1b":
            self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
        else:
            self.cutoffs = []
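
A minimal usage sketch for this constructor, assuming it belongs to the Transformer-XL Corpus class and that Vocab accepts special and lower_case keyword arguments as in that codebase; the directory layout below is an assumption:

# Hypothetical usage; assumes the enclosing class is named Corpus and that
# train.txt / valid.txt / test.txt live under the given path.
import os

corpus = Corpus(
    os.path.join("data", "wikitext-103"),  # assumed layout
    "wt103",
    special=["<eos>"],  # forwarded to Vocab(*args, **kwargs)
    lower_case=False,
)
print(len(corpus.vocab), "tokens; cutoffs:", corpus.cutoffs)

For wt103 only the training split is counted before the vocabulary is built, and the resulting cutoffs feed the adaptive softmax; for the small word-level and character-level datasets the cutoffs list is empty.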
Example #2
def reconcile_dupes(unused_argv):
    """Adds found bond topology ids to the output from topology_from_geom."""
    del unused_argv

    # For each input file, a dataframe.
    df_list: List[pd.DataFrame] = []

    for filepath in gfile.glob(FLAGS.input):
        logging.info("Opening %s", filepath)
        with gfile.GFile(filepath, "r") as f:
            df_list.append(
                pd.read_csv(f,
                            names=[
                                "Smiles", "StartSmi", "id", "Fate", "NBts",
                                "RingAtoms", "IsStart"
                            ]))

    data = pd.concat(df_list)
    del df_list
    logging.info("%s", data.shape)

    # Convert conformer_ids to bond_topology_id by dividing by 1000
    # Expect many dupes to be overwritten here.
    smiles_to_id = dict(zip(data["StartSmi"], data["id"] // 1000))

    # We only care about the cases where there is a BondTopology mismatch.
    # ~ (elementwise NOT) is required here; Python's `not` fails on a Series.
    interesting = data.loc[~data["IsStart"]]
    logging.info("%s", interesting.shape)
    # Convert the two smiles columns to molecules.
    mstart = [Chem.MolFromSmiles(x) for x in interesting["StartSmi"]]
    mfound = [Chem.MolFromSmiles(x) for x in interesting["Smiles"]]

    # Ring score for each of the molecules.
    rstart = [ring_atoms(m) for m in mstart]
    rfound = [ring_atoms(m) for m in mfound]
    same_ring_membership = 0
    different_ring_membership = 0
    no_smiles = 0
    print("FoundSmiles,StartSmi,StartId,FoundId,FoundScore,StartScore")
    for i, (found_score, start_score) in enumerate(zip(rfound, rstart)):
        if found_score == start_score:
            same_ring_membership += 1
            continue

        different_ring_membership += 1
        found_smiles = interesting.iloc[i, 0]
        other_bt = smiles_to_id.get(found_smiles, "*")
        if other_bt == "*":
            logging.info("smiles %s not known", found_smiles)
            no_smiles += 1
        print(f"{found_smiles},{interesting.iloc[i, 1]},"
              f"{interesting.iloc[i, 2]},{other_bt},"
              f"{found_score},{start_score}")

    logging.info("%d molecules different smiles but same ring membership",
                 same_ring_membership)
    logging.info("%d of %d items have different ring membership",
                 different_ring_membership, interesting.shape[0])
    logging.info("%d items had unrecognised smiles", no_smiles)
Example #3
    def __init__(self, path, dataset, *args, **kwargs):
        self.dataset = dataset
        if self.dataset == "generic_dataset":
            encode_kwargs = dict(
                add_eos=kwargs.pop("add_eos", False),
                add_double_eos=kwargs.pop("add_double_eos", False),
                ordered=True,
                verbose=True,
            )
            if kwargs.get("vocab_file") is not None:
                kwargs["vocab_file"] = os.path.join(path, kwargs["vocab_file"])

        print(self.dataset, "vocab params", kwargs)
        self.vocab = Vocab(*args, **kwargs)

        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "generic_dataset" and not self.vocab.vocab_file:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "wt103":
            self.vocab.count_file(os.path.join(path, "train.txt"))
        elif self.dataset == "lm1b":
            train_path_pattern = os.path.join(
                path, "1-billion-word-language-modeling-benchmark-r13output",
                "training-monolingual.tokenized.shuffled", "news.en-*")
            train_paths = glob(train_path_pattern)

            # the vocab will load from file when build_vocab() is called
            # for train_path in sorted(train_paths):
            #   self.vocab.count_file(train_path, verbose=True)

        self.vocab.build_vocab()

        if self.dataset in ["ptb", "wt2", "wt103"]:
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), ordered=True)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), ordered=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), ordered=True)
        elif self.dataset == "generic_dataset":
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), **encode_kwargs)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), **encode_kwargs)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), **encode_kwargs)
        elif self.dataset in ["enwik8", "text8"]:
            self.train = self.vocab.encode_file(
                os.path.join(path, "train.txt"), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(
                os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(
                os.path.join(path, "test.txt"), ordered=True, add_eos=False)
        elif self.dataset == "lm1b":
            self.train = train_paths
            valid_path = os.path.join(path, "valid.txt")
            test_path = valid_path
            self.valid = self.vocab.encode_file(valid_path,
                                                ordered=True,
                                                add_double_eos=True)
            self.test = self.vocab.encode_file(test_path,
                                               ordered=True,
                                               add_double_eos=True)

        if self.dataset == "wt103":
            self.cutoffs = [0, 20000, 40000, 200000] + [len(self.vocab)]
        elif self.dataset == "generic_dataset":
            with open(os.path.join(path, "cutoffs.json")) as f:
                self.cutoffs = json.load(f)
        elif self.dataset == "lm1b":
            self.cutoffs = [0, 60000, 100000, 640000] + [len(self.vocab)]
        else:
            self.cutoffs = []
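
A minimal setup sketch for the "generic_dataset" branch; the directory name, vocab file name, and cutoff values are illustrative assumptions, and Corpus is again the assumed enclosing class:

# Hypothetical preparation for a generic corpus; the constructor above reads
# cutoffs.json back verbatim.
import json
import os

data_dir = "data/my_corpus"  # assumed to hold train/valid/test .txt files
with open(os.path.join(data_dir, "cutoffs.json"), "w") as f:
    json.dump([0, 20000, 40000], f)

corpus = Corpus(
    data_dir,
    "generic_dataset",
    add_eos=True,            # popped into encode_kwargs before Vocab sees it
    vocab_file="vocab.txt",  # resolved relative to data_dir; with a vocab
)                            # file present, count_file() is skipped above.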