Example #1
    def test_delete_empty_entry(self):
        sentences = []
        summaries = []

        # Three valid pairs, then two pairs whose summaries are whitespace-only.
        for i in range(3):
            sentences += ["text " + str(i)]
            summaries += ["sum " + str(i)]
        summaries += ["  "] * 2
        sentences += ["text 3", "text 4"]

        sentences_cleaned, summaries_cleaned = preprocess_utils.delete_empty_entry(
            sentences, summaries)

        self.assertEqual(len(sentences_cleaned), 3)
        self.assertEqual(len(summaries_cleaned), 3)

        try:
            preprocess_utils.validate_dataset(sentences_cleaned,
                                              summaries_cleaned)
        except Exception:
            self.fail("validate_dataset raised Exception unexpectedly!")

        for sentence in sentences_cleaned:
            self.assertEqual(
                summaries[sentences.index(sentence)],
                summaries_cleaned[sentences_cleaned.index(sentence)])
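Note: the tests above and below assume that preprocess_utils.delete_empty_entry drops any sentence/summary pair in which either side is blank, and that preprocess_utils.validate_dataset raises when the two lists differ in length or still contain blank entries. A minimal sketch of those assumed semantics (not the project's actual implementation) could look like this:

def delete_empty_entry(sentences, summaries):
    """Drop pairs where either the sentence or the summary is whitespace-only (assumed behavior)."""
    kept = [(sent, summ) for sent, summ in zip(sentences, summaries)
            if sent.strip() and summ.strip()]
    return [sent for sent, _ in kept], [summ for _, summ in kept]


def validate_dataset(sentences, summaries):
    """Raise if the lists are misaligned or contain blank entries (assumed behavior)."""
    if len(sentences) != len(summaries):
        raise Exception("Sentences and summaries must have the same length.")
    for sent, summ in zip(sentences, summaries):
        if not sent.strip() or not summ.strip():
            raise Exception("Empty entry found in dataset.")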
Example #2
def main(args):
  """ Preprocess the Reddit dataset.

  Args:
    args: Command line arguments
  Raises:
    ValueError when the number of samples is specified to be negative
  """
  num_of_tuning_sam = args.num_of_tuning
  num_of_valid_sam = args.num_of_validation

  if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
    raise ValueError("Number of samples must be non-negative integers")

  if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
    ds = tfds.load('reddit_tifu', split='train', shuffle_files=True)

    sentences = []
    summaries = []
    for row in ds:
      summary = row["title"]
      sentence = row["tldr"]

      sentences.append(sentence.numpy().decode('UTF-8'))
      summaries.append(summary.numpy().decode('UTF-8'))

    cleaned_sentences = preprocess_utils.text_strip(sentences)
    cleaned_summaries = preprocess_utils.text_strip(summaries)

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)

    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)
    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
      tsv_writer = csv.writer(out_file, delimiter='\t')
      for i in range(len(spaced_sentences)):
        tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
          "-------")
  else:
    print("-------Preprocessed data already exists.-------")
  print("-------Now splitting dataset.-------")
  preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                 TUNE_FILE_PATH,
                                 VALID_FILE_PATH,
                                 PREPROCESSED_FILE_PATH,
                                 num_of_tuning_sam,
                                 num_of_valid_sam,
                                 whether_shuffle_entire_set=False,
                                 whether_shuffle_individual_file=True)
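Note: args here is assumed to come from a small argparse parser that exposes the two sample counts. A hypothetical driver (argument names are an assumption, not taken from the original script) might be:

import argparse

if __name__ == "__main__":
    # Hypothetical argument names; the original script may define them differently.
    parser = argparse.ArgumentParser(description="Preprocess the Reddit TIFU dataset.")
    parser.add_argument("num_of_tuning", type=int,
                        help="Number of samples reserved for hyperparameter tuning.")
    parser.add_argument("num_of_validation", type=int,
                        help="Number of samples reserved for validation.")
    main(parser.parse_args())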
Example #3
def __preprocess_input(input_file_path, whether_score):
    """Preprocess the input sentences to fit the format of lasertagger input.

    Args:
        input_file_path: the absolute path to the input file
        whether_score: whether scoring is needed. If scoring is needed, two columns are expected in the input file.
        
    Returns:
        sentences: a list of input sentences
        summaries: a list of summaries
        
    Raises:
        Exception: If scoring is required, but target is not found in the input file
    """
    if not os.path.isfile(os.path.expanduser(input_file_path)):
        __clean_up()
        raise Exception("The input file does not exist")
    print("-------Cleaning inputs-------")
    tsv_file = open(os.path.expanduser(input_file_path))
    read_tsv = csv.reader(tsv_file, delimiter="\t")

    sentences = []
    summaries = []
    for row in read_tsv:
        sentences.append(row[0])
        if whether_score:
            try:
                summaries.append(row[1])
            except IndexError:
                tsv_file.close()
                __clean_up()
                raise Exception(
                    "Whether_score is true. Expected target but only found one column in the input."
                )
    tsv_file.close()

    cleaned_sentences = preprocess_utils.text_strip(sentences)
    if whether_score:
        cleaned_summaries = preprocess_utils.text_strip(summaries)
    else:
        cleaned_summaries = cleaned_sentences

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)

    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    if whether_score:
        spaced_summaries = preprocess_utils.tokenize_with_space(
            cleaned_summaries)
    else:
        spaced_summaries = spaced_sentences

    spaced_sentences, spaced_summaries = preprocess_utils.delete_empty_entry(
        spaced_sentences, spaced_summaries)

    return spaced_sentences, spaced_summaries
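Note: a hypothetical in-module usage of __preprocess_input (the file path and contents below are made up). With whether_score=True the TSV must carry a second, target column; with whether_score=False the returned summaries are simply the cleaned sentences themselves:

import csv

# Write a made-up two-column TSV for the scoring mode.
with open("/tmp/example_input.tsv", "wt") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["The quick brown fox jumps over the lazy dog.", "Fox jumps over dog."])

sentences, summaries = __preprocess_input("/tmp/example_input.tsv", whether_score=True)
sentences_only, copies = __preprocess_input("/tmp/example_input.tsv", whether_score=False)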
Example #4
    def test_validate_dataset_unequal(self):
        sentences = ["text"] * 5
        summaries = ["text"] * 7
        with self.assertRaises(Exception):
            preprocess_utils.validate_dataset(sentences, summaries)

        sentences = ["text"] * 7
        summaries = ["text"] * 7
        try:
            preprocess_utils.validate_dataset(sentences, summaries)
        except Exception:
            self.fail("validate_dataset raised Exception unexpectedly!")
Example #5
    def test_preprocess_input_without_scoring_and_only_one_column(self):
        with open(TEMP_TESTING_FILE, "wt") as f:
            tsv_writer = csv.writer(f, delimiter='\t')
            for i in range(10):
                tsv_writer.writerow(["Sample" + str(i)])

        sentences, summaries = preprocess_input(
            input_file_path=TEMP_TESTING_FILE, whether_score=False)

        preprocess_utils.validate_dataset(sentences, summaries)
        for i in range(10):
            self.assertEqual(sentences[i], "Sample" + str(i))
            self.assertEqual(summaries[i], "Sample" + str(i))
Example #6
    def test_validate_dataset_empty(self):
        sentences = [" "] * 5
        summaries = ["text"] * 5
        with self.assertRaises(Exception):
            preprocess_utils.validate_dataset(sentences, summaries)

        summaries = [" "] * 5
        sentences = ["text"] * 5
        with self.assertRaises(Exception):
            preprocess_utils.validate_dataset(sentences, summaries)

        summaries = ["text"] * 3 + ["   "] * 2
        sentences = ["text"] * 5
        with self.assertRaises(Exception):
            preprocess_utils.validate_dataset(sentences, summaries)
Example #7
def main(args):
  """Preprocess the news dataset.

    Args:
      args: command line arguments
    Raises:
      ValueError when dataset cannot be found in the path provided
    """
  num_of_tuning_sam = args.num_of_tuning
  num_of_valid_sam = args.num_of_validation

  if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
    raise Exception("Number of samples must be non-negative integers")

  data_file_1 = args.news_summary_path
  data_file_2 = args.news_summary_more_path

  if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
    if not os.path.isfile(os.path.expanduser(data_file_1)):
      raise ValueError(
          "Cannot find" + os.path.expanduser(data_file_1) +
          ". If necessary, please download from https://www.kaggle.com/sunnysai12345/news-summary"
      )

    if not os.path.isfile(os.path.expanduser(data_file_2)):
      raise ValueError(
          "Cannot find" + os.path.expanduser(data_file_2) +
          ". If necessary, please download from https://www.kaggle.com/sunnysai12345/news-summary"
      )

    dataset1 = (pd.read_csv(data_file_1,
                            encoding='iso-8859-1')).iloc[:, 0:6].copy()
    dataset2 = (pd.read_csv(data_file_2,
                            encoding='iso-8859-1')).iloc[:, 0:2].copy()

    dataset = pd.DataFrame()
    dataset['sentences'] = pd.concat([dataset1['text'], dataset2['text']],
                                     ignore_index=True)
    dataset['summaries'] = pd.concat(
        [dataset1['headlines'], dataset2['headlines']], ignore_index=True)

    cleaned_sentences = preprocess_utils.text_strip(dataset['sentences'])
    cleaned_summaries = preprocess_utils.text_strip(dataset['summaries'])

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)

    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)
    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
      tsv_writer = csv.writer(out_file, delimiter='\t')
      for i in range(len(spaced_sentences)):
        tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
          "-------")
  else:
    print("-------Preprocessed data already exists.-------")
  print("-------Now splitting dataset.-------")
  preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                 TUNE_FILE_PATH,
                                 VALID_FILE_PATH,
                                 PREPROCESSED_FILE_PATH,
                                 num_of_tuning_sam,
                                 num_of_valid_sam,
                                 whether_shuffle_entire_set=False,
                                 whether_shuffle_individual_file=True)
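Note: preprocess_utils.split_dataset is assumed to slice the preprocessed TSV into tuning, validation, and training files of the requested sizes, with optional shuffling. A rough sketch of that assumed contract (not the actual preprocess_utils implementation):

import csv
import random


def split_dataset(train_path, tune_path, valid_path, preprocessed_path,
                  num_of_tuning, num_of_validation,
                  whether_shuffle_entire_set, whether_shuffle_individual_file):
    """Assumed behavior: first rows go to tuning, the next to validation, the rest to training."""
    with open(preprocessed_path) as f:
        rows = list(csv.reader(f, delimiter="\t"))
    if whether_shuffle_entire_set:
        random.shuffle(rows)

    splits = [(tune_path, rows[:num_of_tuning]),
              (valid_path, rows[num_of_tuning:num_of_tuning + num_of_validation]),
              (train_path, rows[num_of_tuning + num_of_validation:])]
    for path, split_rows in splits:
        if whether_shuffle_individual_file:
            random.shuffle(split_rows)
        with open(path, "wt") as out_file:
            csv.writer(out_file, delimiter="\t").writerows(split_rows)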
Example #8
def main(args):
    """Preprocess the Microsoft text summarization dataset.

    Args:
        args: command line arguments.

    Raises:
        Exception: If the data directory is not found.
        ValueError: If the number of samples is negative.
    """
    data_dir = args.raw_data_dir
    if not os.path.isdir(os.path.expanduser(data_dir)):
        raise Exception("Data directory not found.")

    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise Exception("Number of samples must be non-negative integers")

    train_data_file = data_dir + "/train.tsv"
    train_sentences, train_summaries, train_grammar, train_meaning = __process_file(
        train_data_file)
    test_data_file = data_dir + "/test.tsv"
    test_sentences, test_summaries, test_grammar, test_meaning = __process_file(
        test_data_file)
    valid_data_file = data_dir + "/valid.tsv"
    valid_sentences, valid_summaries, valid_grammar, valid_meaning = __process_file(
        valid_data_file)

    tot_sentences = train_sentences + test_sentences + valid_sentences
    tot_summaries = train_summaries + test_summaries + valid_summaries
    tot_grammar = train_grammar + test_grammar + valid_grammar
    tot_meaning = train_meaning + test_meaning + valid_meaning

    cleaned_sentences = preprocess_utils.text_strip(tot_sentences)
    cleaned_summaries = preprocess_utils.text_strip(tot_summaries)

    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    spaced_sentences = preprocess_utils.tokenize_with_space(cleaned_sentences)
    spaced_summaries = preprocess_utils.tokenize_with_space(cleaned_summaries)

    with open(os.path.expanduser(PREPROCESSED_FILE_PATH), 'wt') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        for i in range(len(spaced_sentences)):
            tsv_writer.writerow([
                spaced_sentences[i], spaced_summaries[i], tot_grammar[i],
                tot_meaning[i]
            ])
    print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
          "-------")

    print("-------Now splitting dataset.-------")
    if num_of_tuning_sam + num_of_valid_sam > len(spaced_sentences):
        raise Exception(
            "The number of tuning and validation samples together exceeds the total sample size of "
            + str(len(spaced_sentences)))

    sentence_shuffled = []
    summary_shuffled = []
    grammar_shuffled = []
    meaning_shuffled = []

    tune_shuffled = list(range(num_of_tuning_sam))
    random.shuffle(tune_shuffled)
    valid_shuffled = list(
        range(num_of_tuning_sam, num_of_tuning_sam + num_of_valid_sam))
    random.shuffle(valid_shuffled)
    train_shuffled = list(
        range(num_of_tuning_sam + num_of_valid_sam, len(spaced_sentences)))
    random.shuffle(train_shuffled)
    index_shuffled = tune_shuffled + valid_shuffled + train_shuffled

    for i in index_shuffled:
        sentence_shuffled.append(spaced_sentences[i])
        summary_shuffled.append(spaced_summaries[i])
        grammar_shuffled.append(tot_grammar[i])
        meaning_shuffled.append(tot_meaning[i])

    tuning_range = range(num_of_tuning_sam)
    valid_range = range(num_of_tuning_sam,
                        num_of_tuning_sam + num_of_valid_sam)
    training_range = range(num_of_tuning_sam + num_of_valid_sam,
                           len(summary_shuffled))

    output_for_grammar_files = [summary_shuffled, grammar_shuffled]
    __write_to_file(TUNE_FILE_PATH_GRAMMAR, tuning_range,
                    output_for_grammar_files)
    __write_to_file(VALID_FILE_PATH_GRAMMAR, valid_range,
                    output_for_grammar_files)
    __write_to_file(TRAIN_FILE_PATH_GRAMMAR, training_range,
                    output_for_grammar_files)

    output_for_meaning_files = [
        sentence_shuffled, summary_shuffled, meaning_shuffled
    ]
    __write_to_file(TUNE_FILE_PATH_MEANING, tuning_range,
                    output_for_meaning_files)
    __write_to_file(VALID_FILE_PATH_MEANING, valid_range,
                    output_for_meaning_files)
    __write_to_file(TRAIN_FILE_PATH_MEANING, training_range,
                    output_for_meaning_files)
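Note: __write_to_file is not shown in this example; judging from the calls above, it presumably writes the given columns, restricted to an index range, as one TSV row per index. A minimal sketch of that assumed helper:

import csv
import os


def __write_to_file(file_path, index_range, columns):
    """Assumed helper: for each index, write one TSV row drawn from every column list."""
    with open(os.path.expanduser(file_path), "wt") as out_file:
        tsv_writer = csv.writer(out_file, delimiter="\t")
        for i in index_range:
            tsv_writer.writerow([column[i] for column in columns])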
Example #9
def main(args):
    """Preprocess the Microsoft text summarization dataset.

    Args:
      args: command line arguments.
    Raises:
      ValueError when the number of samples is negative.
    """
    data_dir = args.raw_data_dir
    if not os.path.isdir(os.path.expanduser(data_dir)):
        raise Exception("Data directory not found.")

    num_of_tuning_sam = args.num_of_tuning
    num_of_valid_sam = args.num_of_validation

    if num_of_valid_sam < 0 or num_of_tuning_sam < 0:
        raise ValueError("Number of samples must be non-negative integers")

    if not os.path.isfile(os.path.expanduser(PREPROCESSED_FILE_PATH)):
        train_data_file = data_dir + "/train.tsv"
        train_sentences, train_summaries, train_ratings, train_excluded = __process_file(
            train_data_file)
        test_data_file = data_dir + "/test.tsv"
        test_sentences, test_summaries, test_ratings, test_excluded = __process_file(
            test_data_file)
        valid_data_file = data_dir + "/valid.tsv"
        valid_sentences, valid_summaries, valid_ratings, valid_excluded = __process_file(
            valid_data_file)

        tot_sentences = train_sentences + test_sentences + valid_sentences
        tot_summaries = train_summaries + test_summaries + valid_summaries
        tot_ratings = train_ratings + test_ratings + valid_ratings
        tot_excluded = train_excluded + test_excluded + valid_excluded

        cleaned_sentences = preprocess_utils.text_strip(tot_sentences)
        cleaned_summaries = preprocess_utils.text_strip(tot_summaries)

        cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
            cleaned_sentences, cleaned_summaries)
        preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
        print("Number of samples is", len(cleaned_sentences))
        print("Total number of excluded sample is", tot_excluded)

        preprocess_utils.calculate_stats(cleaned_sentences, cleaned_summaries)
        spaced_sentences = preprocess_utils.tokenize_with_space(
            cleaned_sentences)
        spaced_summaries = preprocess_utils.tokenize_with_space(
            cleaned_summaries)

        with open(os.path.expanduser(PREPROCESSED_FILE_PATH),
                  'wt') as out_file:
            tsv_writer = csv.writer(out_file, delimiter='\t')
            for i in range(len(spaced_sentences)):
                tsv_writer.writerow([spaced_sentences[i], spaced_summaries[i]])
        print("-------Preprocessed data saved to", PREPROCESSED_FILE_PATH,
              "-------")
    else:
        print("-------Preprocessed data already exists.-------")
    print("-------Now splitting dataset.-------")
    preprocess_utils.split_dataset(TRAIN_FILE_PATH,
                                   TUNE_FILE_PATH,
                                   VALID_FILE_PATH,
                                   PREPROCESSED_FILE_PATH,
                                   num_of_tuning_sam,
                                   num_of_valid_sam,
                                   whether_shuffle_entire_set=False,
                                   whether_shuffle_individual_file=True)
Example #10
def __format_data():
    """ Format the dataset and clean up special characters.

    Returns:
        cleaned_sentences: a list of cleaned input sentences
        cleaned_summaries: a list of cleaned summaries corresponding to the input sentences
    """
    print("-------Processing original sentences-------")
    for i in range(1, 11):
        subprocess.call('cat sent-comp.train' + str(i).zfill(2) +
                        '.json | grep \'"sentence":\' > ~/' +
                        TEMP_FOLDER_NAME + '/train' + str(i) + '.txt',
                        shell=True,
                        cwd=os.path.expanduser(DATASET_DIR))

    subprocess.call('cat comp-data.eval.json | grep \'"sentence":\' > ~/' +
                    TEMP_FOLDER_NAME + '/train11.txt',
                    shell=True,
                    cwd=os.path.expanduser(DATASET_DIR))

    sentences = []
    for i in range(1, 12):
        file_name = os.path.expanduser('~/' + TEMP_FOLDER_NAME + '/train' +
                                       str(i) + '.txt')
        with open(file_name, "r") as f:
            # Each record matches the grep twice, so keep only every other line;
            # the slice strips the leading '"sentence": "' key and the trailing '",'.
            odd_line = True
            for line in f:
                if odd_line:
                    sentences.append(line[17:-3])
                odd_line = not odd_line
    cleaned_sentences = preprocess_utils.text_strip(sentences)

    print("-------Processing summaries-------")
    for i in range(1, 11):
        subprocess.call('cat sent-comp.train' + str(i).zfill(2) +
                        '.json | grep \'"headline":\' > ~/' +
                        TEMP_FOLDER_NAME + '/train' + str(i) + '.txt',
                        shell=True,
                        cwd=os.path.expanduser(DATASET_DIR))

    subprocess.call('cat comp-data.eval.json | grep \'"headline":\' > ~/' +
                    TEMP_FOLDER_NAME + '/train11.txt',
                    shell=True,
                    cwd=os.path.expanduser(DATASET_DIR))

    summaries = []
    for i in range(1, 12):
        file_name = os.path.expanduser('~/' + TEMP_FOLDER_NAME + '/train' +
                                       str(i) + '.txt')
        with open(file_name, "r") as f:
            for line in f:
                # The slice strips the leading '"headline": "' key and the trailing '",'.
                summaries.append(line[15:-3])

    cleaned_summaries = preprocess_utils.text_strip(summaries)
    cleaned_sentences, cleaned_summaries = preprocess_utils.delete_empty_entry(
        cleaned_sentences, cleaned_summaries)
    preprocess_utils.validate_dataset(cleaned_sentences, cleaned_summaries)
    print("Number of samples is", len(cleaned_sentences))

    return cleaned_sentences, cleaned_summaries
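Note: the fixed slice offsets above rely on the layout of the grepped JSON lines. A made-up line showing what the code assumes:

# Hypothetical grepped line from the sentence-compression JSON files:
line = '    "sentence": "The quick brown fox jumps over the lazy dog.",\n'

# 17 characters of prefix ('    "sentence": "') and the 3 trailing characters ('",' plus
# the newline) are stripped, leaving only the raw sentence text.
print(line[17:-3])  # -> The quick brown fox jumps over the lazy dog.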