Example #1
def __preprocess_input(input_file_path, whether_score):
    """Preprocess the input sentences to fit the format of lasertagger input.

    Args:
        input_file_path: the absolute path to the input file
        whether_score: whether scoring is needed. If scoring is needed, two columns are expected in the input file.
        
    Returns:
        sentences: a list of input sentences
        summaries: a list of summaries
        
    Raises:
        Exception: If scoring is required but the target column is not found in the input file
    """
    if not os.path.isfile(os.path.expanduser(input_file_path)):
        __clean_up()
        raise Exception("The input file does not exist")
    print("-------Cleaning inputs-------")
    tsv_file = open(os.path.expanduser(input_file_path))
    read_tsv = csv.reader(tsv_file, delimiter="\t")

    sentences = []
    summaries = []
    for row in read_tsv:
        sentences.append(row[0])
        if whether_score:
            try:
                summaries.append(row[1])
            except IndexError:
                tsv_file.close()
                __clean_up()
                raise Exception(
                    "Whether_score is true. Expected target but only found one column in the input."
                )
    tsv_file.close()

    cleaned_sentences = preprocess_utils.text_strip(sentences)
    if whether_score:
        cleaned_summaries = preprocess_utils.text_strip(summaries)
    else:
        cleaned_summaries = cleaned_sentences

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)

    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    if whether_score:
        spaced_summaries = preprocess_utils.tokenize_with_space(
            cleaned_summaries)
    else:
        spaced_summaries = spaced_sentences

    spaced_sentences, spaced_summaries = preprocess_utils.delete_empty_entry(
        spaced_sentences, spaced_summaries)

    return spaced_sentences, spaced_summaries
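
For reference, a minimal usage sketch, assuming it runs inside the same module; the file path and its contents are made up, and the two-column layout simply follows the docstring above:

import csv
import os

# Hypothetical two-column input file (sentence <tab> target), written only to
# illustrate the format expected when whether_score is True; the path is made up.
example_path = os.path.expanduser("~/lasertagger_input_example.tsv")
with open(example_path, "wt") as out_file:
    writer = csv.writer(out_file, delimiter="\t")
    writer.writerow(["the quick brown fox jumps over the lazy dog",
                     "fox jumps over dog"])

# With whether_score=True both columns are returned; with False the cleaned
# sentences are reused in place of summaries (see the function above).
sentences, summaries = __preprocess_input(example_path, whether_score=True)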
Example #2
def main(args):
  """ Preprocess the Reddit dataset.

  Args:
    args: Command line arguments
  Raises:
    ValueError: If the number of samples is negative
  """
  num_of_tuning_sam = args.num_of_tuning
  num_of_valid_sam = args.num_of_validation

  if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
    raise ValueError("Number of samples must be non-negative integers")

  if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
    ds = tfds.load('reddit_tifu', split='train', shuffle_files=True)

    sentences = []
    summaries = []
    for row in ds:
      summary = row["title"]
      sentence = row["tldr"]

      sentences.append(sentence.numpy().decode('UTF-8'))
      summaries.append(summary.numpy().decode('UTF-8'))

    cleaned_sentences = preprocess_utils.text_strip(sentences)
    cleaned_summaries = preprocess_utils.text_strip(summaries)

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)

    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)
    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
      tsv_writer = csv.writer(out_file, delimiter='\t')
      for i in range(len(spaced_sentences)):
        tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
          "-------")
  else:
    print("-------Preprocessed data exists. Now splitting dataset.-------")
  print("-------Now splitting dataset.-------")
  preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                 TUNE_FILE_PATH,
                                 VALID_FILE_PATH,
                                 PREPROCESSED_FILE_PATH,
                                 num_of_tuning_sam,
                                 num_of_valid_sam,
                                 whether_shuffle_entire_set=False,
                                 whether_shuffle_individual_file=True)
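
The args object above is presumably an argparse namespace; below is a minimal sketch of an entry point that could supply it. Only the attribute names num_of_tuning and num_of_validation are taken from the code above; everything else, including whether the flags are positional or optional, is an assumption.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocess the Reddit TIFU dataset.")
    # Attribute names mirror args.num_of_tuning / args.num_of_validation above;
    # making them positional and integer-typed is an illustrative choice.
    parser.add_argument("num_of_tuning", type=int,
                        help="number of samples reserved for tuning")
    parser.add_argument("num_of_validation", type=int,
                        help="number of samples reserved for validation")
    main(parser.parse_args())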
Example #3
    def test_text_strip(self):
        # test remove escape characters
        test_case = ["\t", "\r", "\n", "\t" * 3, "\r" * 3, "\n" * 7]
        correct_ans = [" "] * 6
        result = preprocess_utils.text_strip(test_case)
        self.assertEqual(result, correct_ans)

        # test removing redundant special characters
        special_chars = ["_", "+", "-", "~", ":", "."]
        test_case = []
        correct_ans = []
        for char in special_chars:
            test_case.append("text" + char + "text")
            test_case.append("text" + char * 2 + "text")
            correct_ans += (["text" + char + "text"]) * 2
        result = preprocess_utils.text_strip(test_case)
        self.assertEqual(result, correct_ans)

        # test removing -, :, and _ at end or beginning of string (not in the middle)
        special_chars = ["-", ":", "_"]
        test_case = []
        correct_ans = []
        for char in special_chars:
            test_case.append("text" + char + "text")
            test_case.append(char + "text")
            test_case.append("text" + char)
            correct_ans += (["text" + char + "text"])
            correct_ans += (["text"]) * 2
        result = preprocess_utils.text_strip(test_case)
        self.assertEqual(result, correct_ans)

        # test removing ~ at end of string (not in the middle or beginning)
        test_case = ["text~", "text ~ text", "text~text", "~text"]
        correct_ans = ['text', 'text ~ text', 'text~text', '~text']
        result = preprocess_utils.text_strip(test_case)
        self.assertEqual(result, correct_ans)

        # test removing . at beginning of string (not in the middle or end)
        test_case = ["text.", "text . text", "text.text", ".text"]
        correct_ans = ['text.', 'text . text', 'text.text', 'text']
        result = preprocess_utils.text_strip(test_case)
        self.assertEqual(result, correct_ans)
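
For orientation, a minimal regex-based sketch that is consistent with the assertions above; the real preprocess_utils.text_strip may well be implemented differently.

import re

def text_strip_sketch(column):
    # Illustrative stand-in for preprocess_utils.text_strip, written only to
    # match the behaviours exercised by the test above.
    cleaned = []
    for row in column:
        row = re.sub(r"[\t\r\n]+", " ", row)         # collapse escape characters to one space
        row = re.sub(r"([_+\-~:.])\1+", r"\1", row)  # collapse runs of repeated special characters
        row = re.sub(r"^[-:_]+|[-:_]+$", "", row)    # strip -, : and _ at either end
        row = re.sub(r"~+$", "", row)                # strip ~ only at the end of the string
        row = re.sub(r"^\.+", "", row)               # strip . only at the beginning
        cleaned.append(row)
    return cleaned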
Example #4
def main(args):
  """Preprocess the news dataset.

    Args:
      args: command line arguments
    Raises:
      ValueError when dataset cannot be found in the path provided
    """
  num_of_tuning_sam = args.num_of_tuning
  num_of_valid_sam = args.num_of_validation

  if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
    raise Exception("Number of samples must be non-negative integers")

  data_file_1 = args.news_summary_path
  data_file_2 = args.news_summary_more_path

  if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
    if not os.path.isfile(os.path.expanduser(data_file_1)):
      raise ValueError(
          "Cannot find" + os.path.expanduser(data_file_1) +
          ". If necessary, please download from https://www.kaggle.com/sunnysai12345/news-summary"
      )

    if not os.path.isfile(os.path.expanduser(data_file_2)):
      raise ValueError(
          "Cannot find" + os.path.expanduser(data_file_2) +
          ". If necessary, please download from https://www.kaggle.com/sunnysai12345/news-summary"
      )

    dataset1 = (pd.read_csv(os.path.expanduser(data_file_1),
                            encoding='iso-8859-1')).iloc[:, 0:6].copy()
    dataset2 = (pd.read_csv(os.path.expanduser(data_file_2),
                            encoding='iso-8859-1')).iloc[:, 0:2].copy()

    dataset = pd.DataFrame()
    dataset['sentences'] = pd.concat([dataset1['text'], dataset2['text']],
                                     ignore_index=True)
    dataset['summaries'] = pd.concat(
        [dataset1['headlines'], dataset2['headlines']], ignore_index=True)

    cleaned_sentences = preprocess_utils.text_strip(dataset['sentences'])
    cleaned_summaries = preprocess_utils.text_strip(dataset['summaries'])

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)

    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)
    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
      tsv_writer = csv.writer(out_file, delimiter='\t')
      for i in range(len(spaced_sentences)):
        tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
          "-------")
  else:
    print("-------Preprocessed data exists. Now splitting dataset.-------")
  print("-------Now splitting dataset.-------")
  preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                 TUNE_FILE_PATH,
                                 VALID_FILE_PATH,
                                 PREPROCESSED_FILE_PATH,
                                 num_of_tuning_sam,
                                 num_of_valid_sam,
                                 whether_shuffle_entire_set=False,
                                 whether_shuffle_individual_file=True)
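
A minimal sketch of driving this main() directly; the attribute names are taken from the accesses above, while the paths and split sizes are illustrative (the CSVs are assumed to have been downloaded from the Kaggle link in the error messages):

from types import SimpleNamespace

# Illustrative arguments object; an argparse Namespace with the same attribute
# names would work equally well.
args = SimpleNamespace(
    num_of_tuning=1000,                              # made-up split sizes
    num_of_validation=1000,
    news_summary_path="news_summary.csv",            # assumed download locations
    news_summary_more_path="news_summary_more.csv",
)
main(args)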
Example #5
def main(args):
    """Preprocess the Microsoft text summarization dataset.

    Args:
        args: command line arguments.
    Raises:
        Exception: If the data directory is not found, the number of samples is
            negative, or the requested splits exceed the total sample size.
    """
    data_dir = args.raw_data_dir
    if not os.path.isdir(os.path.expanduser(data_dir)):
        raise Exception("Data directory not found.")

    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise Exception("Number of samples must be non-negative integers")

    train_data_file = data_dir + "/train.tsv"
    train_sentences, train_summaries, train_grammar, train_meaning = __process_file(
        train_data_file)
    test_data_file = data_dir + "/test.tsv"
    test_sentences, test_summaries, test_grammar, test_meaning = __process_file(
        test_data_file)
    valid_data_file = data_dir + "/valid.tsv"
    valid_sentences, valid_summaries, valid_grammar, valid_meaning = __process_file(
        valid_data_file)

    tot_sentences = train_sentences + test_sentences + valid_sentences
    tot_summaries = train_summaries + test_summaries + valid_summaries
    tot_grammar = train_grammar + test_grammar + valid_grammar
    tot_meaning = train_meaning + test_meaning + valid_meaning

    cleaned_sentences = preprocess_utils.text_strip(tot_sentences)
    cleaned_summaries = preprocess_utils.text_strip(tot_summaries)

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        for i in range(len(spaced_sentences)):
            tsv_writer.writerow([
                spaced_sentences[i], spaced_summaries[i], tot_grammar[i],
                tot_meaning[i]
            ])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
          "-------")

    print("-------Now splitting dataset.-------")
    if num_of_tuning_sam + num_of_valid_sam > len(spaced_sentences):
        raise Exception(
            "The number of tuning and validation samples together exceeds the total sample size of "
            + str(len(spaced_sentences)))

    sentence_shuffled = []
    summary_shuffled = []
    grammar_shuffled = []
    meaning_shuffled = []

    # Shuffle the indices within each split separately so that the
    # tune/validation/train boundaries stay fixed.
    tune_shuffled = list(range(num_of_tuning_sam))
    random.shuffle(tune_shuffled)
    valid_shuffled = list(
        range(num_of_tuning_sam, num_of_tuning_sam + num_of_valid_sam))
    random.shuffle(valid_shuffled)
    train_shuffled = list(
        range(num_of_tuning_sam + num_of_valid_sam, len(spaced_sentences)))
    random.shuffle(train_shuffled)
    index_shuffled = tune_shuffled + valid_shuffled + train_shuffled

    for i in index_shuffled:
        sentence_shuffled.append(spaced_sentences[i])
        summary_shuffled.append(spaced_summaries[i])
        grammar_shuffled.append(tot_grammar[i])
        meaning_shuffled.append(tot_meaning[i])

    tuning_range = range(num_of_tuning_sam)
    valid_range = range(num_of_tuning_sam,
                        num_of_tuning_sam + num_of_valid_sam)
    training_range = range(num_of_tuning_sam + num_of_valid_sam,
                           len(summary_shuffled))

    output_for_grammar_files = [summary_shuffled, grammar_shuffled]
    __write_to_file(TUNE_FILE_PATH_GRAMMAR, tuning_range,
                    output_for_grammar_files)
    __write_to_file(VALID_FILE_PATH_GRAMMAR, valid_range,
                    output_for_grammar_files)
    __write_to_file(TRAIN_FILE_PATH_GRAMMAR, training_range,
                    output_for_grammar_files)

    output_for_meaning_files = [
        sentence_shuffled, summary_shuffled, meaning_shuffled
    ]
    __write_to_file(TUNE_FILE_PATH_MEANING, tuning_range,
                    output_for_meaning_files)
    __write_to_file(VALID_FILE_PATH_MEANING, valid_range,
                    output_for_meaning_files)
    __write_to_file(TRAIN_FILE_PATH_MEANING, training_range,
                    output_for_meaning_files)
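
__write_to_file is not shown in this example; judging only from how it is called (a file path, an index range, and a list of parallel columns), it plausibly looks like the sketch below, though the real helper may differ:

def __write_to_file(file_path, index_range, columns):
    # Hypothetical reconstruction: write one tab-separated row per index,
    # taking the i-th entry of every column passed in.
    with open(os.path.expanduser(file_path), "wt") as out_file:
        tsv_writer = csv.writer(out_file, delimiter="\t")
        for i in index_range:
            tsv_writer.writerow([column[i] for column in columns])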
Example #6
def main(args):
    """Preprocess the Microsoft text summarization dataset.

    Args:
      args: command line arguments.
    Raises:
      ValueError: If the number of samples is negative.
    """
    data_dir = args.raw_data_dir
    if not os.path.isdir(os.path.expanduser(data_dir)):
        raise Exception("Data directory not found.")

    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise ValueError("Number of samples must be non-negative integers")

    if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
        train_data_file = data_dir + "/train.tsv"
        train_sentences, train_summaries, train_ratings, train_excluded = __process_file(
            train_data_file)
        test_data_file = data_dir + "/test.tsv"
        test_sentences, test_summaries, test_ratings, test_excluded = __process_file(
            test_data_file)
        valid_data_file = data_dir + "/valid.tsv"
        valid_sentences, valid_summaries, valid_ratings, valid_excluded = __process_file(
            valid_data_file)

        tot_sentences = train_sentences + test_sentences + valid_sentences
        tot_summaries = train_summaries + test_summaries + valid_summaries
        tot_ratings = train_ratings + test_ratings + valid_ratings
        tot_excluded = train_excluded + test_excluded + valid_excluded

        cleaned_sentences = preprocess_utils.text_strip(tot_sentences)
        cleaned_summaries = preprocess_utils.text_strip(tot_summaries)

        cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
            cleaned_sentences, cleaned_summaries)
        preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
        print("Number of samples is", len(cleaned_sentences))
        print("Total number of excluded sample is", tot_excluded)

        preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)
        spaced_sentences = preprocess_utils.tokenize_with_space(
            cleaned_sentences)
        spaced_summaries = preprocess_utils.tokenize_with_space(
            cleaned_summaries)

        with open(os.path.expanduser(PREPROCESSED_FILE_PATH),
                  'wt') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            for i in range(len(spaced_sentences)):
                tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
        print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
              "-------")
    else:
        print("-------Preprocessed data exists. Now splitting dataset.-------")
    print("-------Now splitting dataset.-------")
    preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                   TUNE_FILE_PATH,
                                   VALID_FILE_PATH,
                                   PREPROCESSED_FILE_PATH,
                                   num_of_tuning_sam,
                                   num_of_valid_sam,
                                   whether_shuffle_entire_set=False,
                                   whether_shuffle_individual_file=True)
Example #7
def __format_data():
    """ Format the dataset and clean up special characters.

    Returns:
        cleaned_sentences: a list of cleaned input sentences
        cleaned_summaries: a list of cleaned summaries corresponding to the input sentences
    """
    print("-------Processing original sentences-------")
    for i in range(1, 11):
        subprocess.call('cat sent-comp.train' + str(i).zfill(2) +
                        '.json | grep \'"sentence":\' > ~/' +
                        TEMP_FOLDER_NAME + '/train' + str(i) + '.txt',
                        shell=True,
                        cwd=os.path.expanduser(DATASET_DIR))

    subprocess.call('cat comp-data.eval.json | grep \'"sentence":\' > ~/' +
                    TEMP_FOLDER_NAME + '/train11.txt',
                    shell=True,
                    cwd=os.path.expanduser(DATASET_DIR))

    sentences = []
    for i in range(1, 12):
        file_name = os.path.expanduser(TEMP_FOLDER_NAME) + '/train' + str(
            i) + '.txt'
        f = open(file_name, "r")
        odd_line = True
        for line in f:
            # The sentence field is matched twice per record, so keep every other
            # line; the slice drops the leading indentation plus the '"sentence": "'
            # key and the trailing '",\n'.
            if odd_line:
                sentences.append(line[17:-3])
            odd_line = not odd_line
        f.close()
    cleaned_sentences = preprocess_utils.text_strip(sentences)

    print("-------Processing summaries-------")
    for i in range(1, 11):
        subprocess.call('cat sent-comp.train' + str(i).zfill(2) +
                        '.json | grep \'"headline":\' > ~/' +
                        TEMP_FOLDER_NAME + '/train' + str(i) + '.txt',
                        shell=True,
                        cwd=os.path.expanduser(DATASET_DIR))

    subprocess.call('cat comp-data.eval.json | grep \'"headline":\' > ~/' +
                    TEMP_FOLDER_NAME + '/train11.txt',
                    shell=True,
                    cwd=os.path.expanduser(DATASET_DIR))

    summaries = []
    for i in range(1, 12):
        file_name = os.path.expanduser(TEMP_FOLDER_NAME) + '/train' + str(
            i) + '.txt'
        f = open(file_name, "r")
        for line in f:
            # The slice drops the leading indentation plus the '"headline": "' key
            # and the trailing '",\n'.
            summaries.append(line[15:-3])
        f.close()

    cleaned_summaries = preprocess_utils.text_strip(summaries)
    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    return cleaned_sentences, cleaned_summaries
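
The cat | grep pipeline above only pulls out the lines containing a given JSON key; an equivalent pure-Python extraction (illustrative only, the original sticks with the shell pipeline and temporary files) might look like:

def _grep_field(json_path, key):
    # Illustrative stand-in for the `cat ... | grep '"<key>":'` step above:
    # return the raw lines that mention the given JSON key.
    matches = []
    with open(os.path.expanduser(json_path), "r") as f:
        for line in f:
            if '"' + key + '":' in line:
                matches.append(line)
    return matches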
Example #8
def main(args):
    """Preprocess the CoLA grammar dataset.
    Args:
        args: command line arguments.
    """
    data_file = os.path.expanduser(args.raw_data_file)
    if not os.path.isfile(data_file):
        raise Exception("Data file not found.")

    sentences_positive = []
    sentences_negative = []

    with open(data_file) as tsv_file:
        read_tsv = csv.reader(tsv_file, delimiter="\t")
        for line in read_tsv:
            if int(line[1]) == 1:
                sentences_positive.append(line[3])
            else:
                sentences_negative.append(line[3])

    cleaned_sentences_positive = preprocess_utils.text_strip(
        sentences_positive)
    cleaned_sentences_negative = preprocess_utils.text_strip(
        sentences_negative)

    print("Number of samples is",
          len(cleaned_sentences_positive) + len(cleaned_sentences_negative))
    print("Number of incorrect sample is", len(cleaned_sentences_negative),
          "and number of correct sample is", len(cleaned_sentences_positive))

    spaced_sentences_positive = preprocess_utils.tokenize_with_space(
        cleaned_sentences_positive)
    spaced_sentences_negative = preprocess_utils.tokenize_with_space(
        cleaned_sentences_negative)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        for positive_sentence in spaced_sentences_positive:
            tsv_writer.writerow([positive_sentence, "1"])
        for negative_sentence in spaced_sentences_negative:
            tsv_writer.writerow([negative_sentence, "0"])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
          "-------")

    print("-------Now mixing dataset with the MS dataset.-------")
    MS_data_file = os.path.expanduser(args.MS_data_file)
    if not os.path.isfile(MS_data_file):
        raise Exception("Microsoft data file not found.")

    MS_sentences = []
    MS_ratings = []
    number_of_MS_samples_in_each_category = [0, 0]

    with open(MS_data_file) as tsv_file:
        read_tsv = csv.reader(tsv_file, delimiter="\t")
        for line in read_tsv:
            MS_sentences.append(line[0])
            MS_ratings.append(int(line[1]))
            number_of_MS_samples_in_each_category[int(line[1])] += 1

    max_negative_rate = (number_of_MS_samples_in_each_category[0] +
                         len(cleaned_sentences_negative)) / \
                        (sum(number_of_MS_samples_in_each_category) + len(cleaned_sentences_negative))
    min_negative_rate = (number_of_MS_samples_in_each_category[0] +
                         len(cleaned_sentences_negative)) / \
                        (sum(number_of_MS_samples_in_each_category) + len(cleaned_sentences_positive) +
                         len(cleaned_sentences_negative))

    goal_percentage = args.goal_percentage_of_neg_samples
    if goal_percentage is None:
        number_of_pos_sample_to_include = 0
    else:
        if goal_percentage > max_negative_rate:
            raise Exception(
                "The goal negative sample percentage is greater than the largest"
                "possible value {:.2f}".format(max_negative_rate))

        if goal_percentage < min_negative_rate:
            raise Exception(
                "The goal negative sample percentage is smaller than the smallest"
                "possible value {:.2f}".format(min_negative_rate))

        number_of_pos_sample_to_include = int(
            (1 - goal_percentage) / goal_percentage *
            (len(cleaned_sentences_negative) +
             number_of_MS_samples_in_each_category[0]) -
            number_of_MS_samples_in_each_category[1])

        print("------- Including", number_of_pos_sample_to_include,
              "samples from the cola dataset.")

    MS_sentences = MS_sentences + spaced_sentences_positive[0:number_of_pos_sample_to_include] + \
                   spaced_sentences_negative
    MS_ratings = MS_ratings + [1] * number_of_pos_sample_to_include + [
        0
    ] * len(spaced_sentences_negative)

    actual_negative_rate = (number_of_MS_samples_in_each_category[0] +
                            len(spaced_sentences_negative)) / \
                           (sum(number_of_MS_samples_in_each_category) +
                            len(spaced_sentences_negative) + number_of_pos_sample_to_include)

    print("-------The percentage of negative sample is",
          "{:.2f}".format(actual_negative_rate), "-------")

    shuffled_index = list(range(len(MS_sentences)))
    random.shuffle(shuffled_index)

    with open(os.path.expanduser(MIXED_FILE_PATH), 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        for index in shuffled_index:
            tsv_writer.writerow([MS_sentences[index], MS_ratings[index]])

    print("-------", len(MS_sentences), "samples saved to", MIXED_FILE_PATH,
          "-------")