def main():
    """Build train/dev pickles for a tab-separated paraphrase-scoring corpus.

    Usage: script.py INPUT_FILE OUTPUT_PREFIX

    Each input line is ``avg_score \t _ \t phrase1 \t phrase2 \t _``; the
    average similarity score is ceiled into one of the labels "1.0".."5.0".
    The shuffled data is split: the first 4000 items are held out (presumably
    for a test set written elsewhere — TODO confirm), 10% of the remainder
    becomes dev, and the rest becomes train.
    """
    input_file, output_prefix = sys.argv[1:]
    dataset = []
    labels = ["1.0", "2.0", "3.0", "4.0", "5.0"]
    label_map = dl.make_label_map(labels)
    with open(input_file, 'r') as in_file:
        for line in in_file:
            (avg_score, _, p1, p2, _) = line.split("\t")
            p1_words = [word.lower() for word in p1.split()]
            p2_words = [word.lower() for word in p2.split()]
            # BUG FIX: math.ceil returns an int in Python 3, so
            # str(ceil(...)) produced "3", which never matches the
            # "1.0".."5.0" keys in label_map (KeyError). Format through
            # float() so the label reads "3.0".
            label = str(float(ceil(float(avg_score))))
            if label == '0.0':
                # A score of exactly 0 ceils to "0.0", which is not a valid
                # label; fold it into the lowest class.
                label = '1.0'
            dataset.append((p1_words, p2_words, label_map[label]))
    # Fixed seed so the train/dev split is reproducible across runs.
    random.Random(137).shuffle(dataset)
    train_len = len(dataset) - 4000
    dev_len = int(0.1 * float(train_len))
    dev = dataset[4000:(4000 + dev_len)]
    train = dataset[(4000 + dev_len):]
    # NOTE: renamed the loop variable from `dataset` (which shadowed the
    # full dataset list above) to `split`.
    for split, filename in zip([train, dev], ["train", "dev"]):
        out_file = output_prefix + filename + ".pickle"
        dl.make_pickle(split, label_map, out_file)
def main():
    """Convert a question-classification training file into a pickle.

    Usage: script.py INPUT_PREFIX OUTPUT_PREFIX

    Reads ``INPUT_PREFIX/train.label``, derives the label inventory from the
    data itself, maps each label to an integer id, and writes
    ``OUTPUT_PREFIX + "train.pickle"``.
    """
    input_prefix, output_prefix = sys.argv[1:]
    train = read_dataset_sentences(input_prefix + '/train.label')
    for split, split_name in zip([train], ["train"]):
        out_path = output_prefix + split_name + ".pickle"
        # Label inventory is whatever actually occurs in this split.
        observed = {label for _, label in split}
        label_map = dl.make_label_map(sorted(observed))
        mapped = [
            (question, label_map[label])
            for question, label in split
        ]
        dl.make_pickle(mapped, label_map, out_path)
def main():
    """Build windowed train/dev pickles for a tagging-style dataset.

    Usage: script.py INPUT_PREFIX OUTPUT_PREFIX WINDOW_SIZE

    Reads ``train.txt`` and ``dev.txt`` under INPUT_PREFIX, converts each
    into fixed-size windows, and writes one pickle per split. Each split's
    label map is derived from the labels observed in that split.
    """
    input_prefix, output_prefix, window_size = sys.argv[1:]
    window_size = int(window_size)
    train = read_dataset_sentences(input_prefix + '/train.txt', window_size)
    valid = read_dataset_sentences(input_prefix + '/dev.txt', window_size)
    train_windows = make_windows(train, window_size)
    valid_windows = make_windows(valid, window_size)
    splits = [(train_windows, "train"), (valid_windows, "dev")]
    for split, split_name in splits:
        out_path = output_prefix + split_name + ".pickle"
        observed = {label for _, label in split}
        label_map = dl.make_label_map(sorted(observed))
        mapped = [
            (window, label_map[label])
            for window, label in split
        ]
        dl.make_pickle(mapped, label_map, out_path)
def main():
    """Build a windowed training pickle with a fixed-up label inventory.

    Usage: script.py INPUT_PREFIX OUTPUT_PREFIX WINDOW_SIZE

    Reads ``INPUT_PREFIX/train.txt``, converts it into fixed-size windows,
    and writes ``OUTPUT_PREFIX + "train.pickle"``. The label ``'I-LST'`` is
    injected because it does not occur in the training data but is expected
    at evaluation time.
    """
    input_prefix, output_prefix, window_size = sys.argv[1:]
    window_size = int(window_size)
    train = read_dataset_sentences(input_prefix + '/train.txt', window_size)
    train_windows = make_windows(train, window_size)
    observed = {label for _, label in train_windows}
    observed.add('I-LST')  # Missing from training set
    label_map = dl.make_label_map(sorted(observed))
    for split, split_name in zip([train_windows], ["train"]):
        out_path = output_prefix + split_name + ".pickle"
        mapped = [
            (window, label_map[label])
            for window, label in split
        ]
        dl.make_pickle(mapped, label_map, out_path)
def construct_sentiment_dataset(input_prefix):
    """Build binary-sentiment train/test/dev splits (SST-style corpus).

    Args:
        input_prefix: directory prefix passed through to the ``read_*``
            helpers that load sentences, the phrase dictionary, sentiment
            labels, and the official split.

    Returns:
        ``(new_train, new_test, new_dev, label_map)`` where each split is a
        list of ``(lowercased_word_list, label_id)`` pairs and ``label_map``
        maps the binary label to its integer id.

    Fine-grained scores 0-2 collapse to negative (0.0), 4-5 to positive
    (1.0); neutral phrases (score 3) map to ``None`` and are dropped.
    """
    binary_label_map = {
        0.0: 0.0,
        1.0: 0.0,
        2.0: 0.0,
        3.0: None,  # neutral — excluded from the binary task
        4.0: 1.0,
        5.0: 1.0,
    }
    dataset_sentences = read_dataset_sentences(input_prefix)
    dictionary = read_dictionary(input_prefix)
    sentiment_labels = read_sentiment_labels(input_prefix)
    train, test, dev = read_dataset_split(input_prefix)
    # First pass: resolve each phrase's binary label, dropping neutrals,
    # and collect the label inventory across ALL splits.
    labelled_splits = []
    labels = set()
    for old in (train, test, dev):
        temp = []
        for sent_id in old:
            phrase = dataset_sentences[sent_id]
            # Undo PTB-style bracket escaping so dictionary lookup matches.
            phrase = phrase.replace("-LRB-", "(").replace("-RRB-", ")")
            string_label = sentiment_labels[dictionary[phrase]]
            binary_sentiment_label = get_binary_label(binary_label_map, string_label)
            if binary_sentiment_label is not None:
                temp.append((phrase.split(), binary_sentiment_label))
                labels.add(binary_sentiment_label)
        labelled_splits.append(temp)
    # BUG FIX: the original rebuilt label_map inside the per-split loop, so
    # splits could receive inconsistent label->id mappings (and the returned
    # map described only the dev split). Build ONE map from the union of
    # labels so train/test/dev share the same ids.
    label_map = dl.make_label_map(sorted(labels))
    new_train, new_test, new_dev = [], [], []
    for temp, new in zip(labelled_splits, [new_train, new_test, new_dev]):
        for phrase, label in temp:
            lowered_phrase = [word.lower() for word in phrase]
            new.append((lowered_phrase, label_map[label]))
    return new_train, new_test, new_dev, label_map