Example #1
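# Context note (not in the original snippets): these examples assume the standard imports
# logging, sys, tempfile, pandas, typing (Any, Mapping, Optional, Tuple) and
# sklearn.model_selection.train_test_split, plus project-level helpers
# (Columns, DEFAULT_CORRECTOR_CONFIG, merge_dicts, corrupt_tokens_in_df) whose exact
# module paths depend on the surrounding codebase.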
def get_datasets(
    prepared_data: pandas.DataFrame,
    config: Optional[Mapping[str, Any]] = None,
    processes_number: int = DEFAULT_CORRECTOR_CONFIG["processes_number"],
) -> Tuple[pandas.DataFrame, pandas.DataFrame]:
    """
    Create the train and test datasets of typos.

    1. Take the specified number of lines from the input dataset.
    2. Make artificial typos in picked identifiers and split them into train and test.
    3. Return results.
    :param prepared_data: Dataframe of correctly split identifiers. Must contain columns \
                          Columns.Split, Columns.Frequency and Columns.Token.
    :param config: Parameters for creating train and test datasets, options:
                   train_size: Train dataset size.
                   test_size: Test dataset size.
                   typo_probability: Probability of token corruption.
                   add_typo_probability: Probability of second corruption for a corrupted token.
                   train_path: Path to the .csv file where to save the train dataset.
                   test_path: Path to the .csv file where to save the test dataset.
    :param processes_number: Number of processes for multiprocessing.
    :return: Train and test datasets.
    """
    log = logging.getLogger("get_datasets")
    if config is None:
        config = {}
    config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["datasets"], config)
    # With replace=True we sample from the real distribution of examples, at the cost of
    # a small chance that the same misspelling example appears in both the train and the
    # test dataset (small because many distinct random typos can be made in a single word).
    mask = [len(token) > 1 for token in prepared_data[Columns.Token]]
    data = prepared_data[mask].sample(config["train_size"] + config["test_size"],
                                      weights=Columns.Frequency,
                                      replace=True)
    train, test = train_test_split(data[[Columns.Token, Columns.Split]],
                                   test_size=config["test_size"])
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    log.info("train dataset shape: %s", train.shape)
    log.info("test dataset shape: %s", test.shape)
    train = corrupt_tokens_in_df(train, config["typo_probability"],
                                 config["add_typo_probability"],
                                 processes_number)
    test = corrupt_tokens_in_df(test, config["typo_probability"],
                                config["add_typo_probability"],
                                processes_number)
    if config["test_path"] is not None:
        test.to_csv(config["test_path"])
        log.info("test dataset is saved to %s", config["test_path"])
    if config["train_path"] is not None:
        train.to_csv(config["train_path"])
        log.info("train dataset is saved to %s", config["train_path"])
    return train, test
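
A minimal usage sketch (not from the original source): it assumes a prepared dataframe with
Columns.Split, Columns.Frequency and Columns.Token, loaded here from a hypothetical
prepared.csv, with hypothetical values for the config keys documented in the docstring above.

import pandas

prepared_data = pandas.read_csv("prepared.csv")  # hypothetical input path
train, test = get_datasets(
    prepared_data,
    config={
        "train_size": 50000,        # hypothetical sizes and probabilities
        "test_size": 10000,
        "typo_probability": 0.5,
        "add_typo_probability": 0.05,
        "train_path": "train.csv",  # hypothetical output paths; None skips saving
        "test_path": "test.csv",
    },
    processes_number=4,
)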
Example #2
def test_corrupt_tokens_in_df(self):
    # Method of a unittest.TestCase subclass: corruption must preserve the originals
    # in the Columns.CorrectSplit / Columns.CorrectToken columns.
    data = pandas.DataFrame([["get value", "get"],
                             ["get value", "value"],
                             ["gut", "gut"],
                             ["put tok", "put"],
                             ["put tok", "tok"],
                             ["put", "put"]], columns=[Columns.Split, Columns.Token])
    corrupted = corrupt_tokens_in_df(data, 0.5, 0.1, processes_number=1)
    self.assertSetEqual(set(corrupted.columns), {Columns.Split, Columns.Token,
                                                 Columns.CorrectToken, Columns.CorrectSplit})
    self.assertListEqual(list(corrupted[Columns.CorrectSplit]), list(data[Columns.Split]))
    self.assertListEqual(list(corrupted[Columns.CorrectToken]), list(data[Columns.Token]))
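
The invariant checked above can also be used to inspect the corruptions directly; a small
sketch continuing from the same `data` and `corrupted` frames (not part of the original test):

changed = corrupted[corrupted[Columns.Token] != corrupted[Columns.CorrectToken]]
print("%d of %d tokens were corrupted" % (len(changed), len(corrupted)))
print(changed[[Columns.CorrectToken, Columns.Token]])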
Example #3
def train_fasttext(data: pandas.DataFrame, config: Optional[Mapping[str, Any]] = None) -> None:
    """
    Train fasttext model on the given dataset of code identifiers.

    :param data: Dataframe with columns Columns.Split and Columns.Frequency.
    :param config: Parameters for training the model, options:
                   size: Number of identifiers to pick from the given data to train fasttext on.
                   corrupt: Whether to make random artificial typos in the training data. \
                            Identifiers are corrupted with `typo_probability`.
                   typo_probability: Token corruption probability if `corrupt == True`.
                   add_typo_probability: Probability of a second corruption in a corrupted \
                                         token. Used if `corrupt == True`.
                   path: Path where to store the trained fasttext model.
                   dim: Number of dimensions for embeddings in the new model.
                   bucket: Number of hash buckets to keep in the fasttext model: \
                           the fewer there are, the more compact the model gets.
                   adjust_frequencies: Whether to divide frequencies by the number of tokens in \
                                       the identifiers. This is needed when the output of the \
                                       `prepare` function is used as data, in order to obtain \
                                       the true distribution of identifiers.
    """
    try:
        import fastText
    except ImportError:
        sys.exit("Please install fastText."
                 "Run `pip3 install git+https://github.com/facebookresearch/fastText"
                 "@51e6738d734286251b6ad02e4fdbbcfe5b679382`")
    log = logging.getLogger("train_fasttext")
    if config is None:
        config = {}
    config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["fasttext"], config)
    tokens_number = data[Columns.Split].apply(lambda x: len(str(x).split()))
    if config["adjust_frequencies"]:
        weights = data[Columns.Frequency] / tokens_number
    else:
        weights = data[Columns.Frequency]
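    # Keep only identifiers that split into more than one token, then sample the requested
    # number of training examples, weighted by frequency and with replacement.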
    train_data = data[tokens_number > 1].sample(config["size"], weights=weights, replace=True)
    if config["corrupt"]:
        train_data = corrupt_tokens_in_df(train_data, config["typo_probability"],
                                          config["add_typo_probability"])
    with tempfile.NamedTemporaryFile(mode="w") as ids_file:
        for token_split in train_data[Columns.Split]:
            ids_file.write(token_split + "\n")
        ids_file.flush()  # make sure all identifiers are on disk before fastText reads them
        log.info("Training fasttext model...")
        model = fastText.train_unsupervised(ids_file.name, minCount=1, epoch=10,
                                            dim=config["dim"],
                                            bucket=config["bucket"])
    model.save_model(config["path"])
    log.info("fasttext model is saved to %s", config["path"])