def create_dataset():
    """Read the CSDMC2010 SPAM corpus and split it into spam and ham subsets.

    Returns:
        ((spam_examples, spam_annotations), (ham_examples, ham_annotations))
        where each example is a dict mapping tokens (from words longer than
        3 chars) to True, and each annotation is a bool with True == spam
        (the label file stores 0 for spam / 1 for ham; we invert it).
    """
    data_toplevel_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "examples", "datasets",
                     "csmining", "parsed", "CSDMC2010_SPAM"))
    # TRAINING holds both the training and the validation messages.
    raw_data_path = os.path.join(data_toplevel_path, "TRAINING")

    # Each label line is "<label> <filename>"; normalize whitespace and split.
    with open(os.path.join(data_toplevel_path, "SPAMTrain.label"), "r") as f:
        annotations = [' '.join(line.strip().split()).split() for line in f]

    examples = []
    for data_file in os.listdir(raw_data_path):
        with open(os.path.join(raw_data_path, data_file), "r") as f:
            text = f.read()
        # Keep only words longer than 3 chars, tokenize, and build a
        # presence-set feature vector (token -> True).
        feature_vec = dict.fromkeys(
            tokenize(' '.join(word for word in text.split()
                              if len(word) > 3)),
            True)
        examples.append((data_file, feature_vec))

    # Sort both sequences by filename so examples[i] lines up with
    # annotations[i] before the pairing columns are stripped off.
    examples = sorted(examples, key=operator.itemgetter(0))
    annotations = sorted(annotations, key=operator.itemgetter(1))
    examples = [feature_vec for data_file, feature_vec in examples]
    # Label file convention: 0 == spam, 1 == ham; invert so True means spam.
    annotations = [not bool(int(annotation))
                   for annotation, data_file in annotations]

    spam_examples, spam_annotations = [], []
    ham_examples, ham_annotations = [], []

    def partition_scheme(example, annotation):
        # Route each (example, annotation) pair into the spam or ham bucket.
        if annotation:
            spam_examples.append(example)
            spam_annotations.append(annotation)
        else:
            ham_examples.append(example)
            ham_annotations.append(annotation)

    abstract_partition(examples, annotations, partition_scheme)
    # Sanity check: every example landed in exactly one bucket.
    assert len(ham_examples) + len(spam_examples) == len(examples)
    return (spam_examples, spam_annotations), (ham_examples, ham_annotations)
def partition_datasets(spam_dataset, ham_dataset, *,
                       validation_fraction=0.4, seed=12345):
    """Randomly split spam and ham datasets into training and validation sets.

    Args:
        spam_dataset: (spam_examples, spam_annotations) pair.
        ham_dataset: (ham_examples, ham_annotations) pair.
        validation_fraction: approximate fraction of examples routed to the
            validation set (default 0.4, the historical behavior).
        seed: RNG seed; fixed by default so the split is reproducible.

    Returns:
        ((training_examples, training_annotations),
         (validation_examples, validation_annotations))
    """
    spam_examples, spam_annotations = spam_dataset
    ham_examples, ham_annotations = ham_dataset

    training_examples, training_annotations = [], []
    validation_examples, validation_annotations = [], []
    spam_in_training = []  # one entry per spam example placed in training

    random.seed(seed)

    def partition_scheme(example, annotation):
        # Coin-flip each example into the validation or training set.
        if random.uniform(0, 1) <= validation_fraction:
            validation_examples.append(example)
            validation_annotations.append(annotation)
        else:
            training_examples.append(example)
            training_annotations.append(annotation)
            if annotation:  # True == spam
                spam_in_training.append(1)

    # Ham first, then spam — the order matters because both passes consume
    # the same seeded RNG stream.
    abstract_partition(ham_examples, ham_annotations, partition_scheme)
    abstract_partition(spam_examples, spam_annotations, partition_scheme)

    spam_in_training = sum(spam_in_training)
    print("spam_in_training: %s, ham_in_training: %s" %
          (spam_in_training, len(training_examples) - spam_in_training))
    print("spam_in_validation: %s, ham_in_validation: %s" %
          (len(spam_examples) - spam_in_training,
           len(validation_examples) - len(spam_examples) + spam_in_training))
    return (training_examples, training_annotations),\
        (validation_examples, validation_annotations)
def create_and_partition_dataset(data_toplevel_path=None):
    """Read the CSDMC2010 SPAM corpus and split it into training/validation.

    Unlike create_dataset(), examples here are the raw message text — no
    tokenization or feature extraction is performed.

    Args:
        data_toplevel_path: corpus root directory; defaults to the
            historical /scratch/madeline/CSDMC2010_SPAM location.

    Returns:
        ((training_examples, training_annotations),
         (validation_examples, validation_annotations))
        where annotations are bools with True == spam.
    """
    if data_toplevel_path is None:
        data_toplevel_path = os.path.join("/scratch", "madeline",
                                          "CSDMC2010_SPAM")
    # TRAINING holds both the training and the validation messages.
    raw_data_path = os.path.join(data_toplevel_path, "TRAINING")

    ### Read in data set ###
    # Each label line is "<label> <filename>"; normalize whitespace and split.
    with open(os.path.join(data_toplevel_path, "SPAMTrain.label"), "r") as f:
        annotations = [' '.join(line.strip().split()).split() for line in f]

    examples = []
    for data_file in os.listdir(raw_data_path):
        with open(os.path.join(raw_data_path, data_file), "r") as f:
            examples.append((data_file, f.read()))

    # Sort both sequences by filename so examples[i] lines up with
    # annotations[i] before the pairing columns are stripped off.
    examples = sorted(examples, key=operator.itemgetter(0))
    annotations = sorted(annotations, key=operator.itemgetter(1))
    examples = [text for data_file, text in examples]
    # Label file convention: 0 == spam, 1 == ham; invert so True means spam.
    annotations = [not bool(int(annotation))
                   for annotation, data_file in annotations]

    spam_examples, spam_annotations = [], []
    ham_examples, ham_annotations = [], []

    def spam_ham_scheme(example, annotation):
        # Route each (example, annotation) pair into the spam or ham bucket.
        if annotation:
            spam_examples.append(example)
            spam_annotations.append(annotation)
        else:
            ham_examples.append(example)
            ham_annotations.append(annotation)

    abstract_partition(examples, annotations, spam_ham_scheme)
    # Sanity check: every example landed in exactly one bucket.
    assert len(ham_examples) + len(spam_examples) == len(examples)

    ### partition it into training and validation sets ###
    training_examples, training_annotations = [], []
    validation_examples, validation_annotations = [], []
    spam_in_training = []  # one entry per spam example placed in training

    random.seed(12345)  # fixed seed: reproducible split
    percent_of_data_in_validation_set = 0.4

    def train_val_scheme(example, annotation):
        # Coin-flip each example into the validation or training set.
        if random.uniform(0, 1) <= percent_of_data_in_validation_set:
            validation_examples.append(example)
            validation_annotations.append(annotation)
        else:
            training_examples.append(example)
            training_annotations.append(annotation)
            if annotation:  # True == spam
                spam_in_training.append(1)

    # Ham first, then spam — the order matters because both passes consume
    # the same seeded RNG stream.
    abstract_partition(ham_examples, ham_annotations, train_val_scheme)
    abstract_partition(spam_examples, spam_annotations, train_val_scheme)

    spam_in_training = sum(spam_in_training)
    print("spam_in_training: %s, ham_in_training: %s" %
          (spam_in_training, len(training_examples) - spam_in_training))
    print("spam_in_validation: %s, ham_in_validation: %s" %
          (len(spam_examples) - spam_in_training,
           len(validation_examples) - len(spam_examples) + spam_in_training))
    return (training_examples, training_annotations),\
        (validation_examples, validation_annotations)