Example #1
import operator
import os

def create_dataset():
    data_toplevel_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "examples", "datasets",
                     "csmining", "parsed", "CSDMC2010_SPAM"))
    raw_data_path = os.path.join(data_toplevel_path,
                                 "TRAINING")  # includes validation data too

    examples = list()

    # Each line of the label file reads "<label> <filename>".
    with open(os.path.join(data_toplevel_path, "SPAMTrain.label"), "r") as f:
        annotations = [line.split() for line in f]

    # Reduce each message to a bag-of-words feature dict, keeping only
    # tokens longer than three characters.
    for data_file in os.listdir(raw_data_path):
        with open(os.path.join(raw_data_path, data_file), "r") as f:
            text = f.read()
        features = dict.fromkeys(
            tokenize(' '.join(word for word in text.split()
                              if len(word) > 3)), True)
        examples.append((data_file, features))

    # Sort both lists by filename so examples and annotations line up
    # (the filename is field 1 of each label line).
    examples = sorted(examples, key=operator.itemgetter(0))
    annotations = sorted(annotations, key=operator.itemgetter(1))

    examples = [feature_vec for data_file, feature_vec in examples]
    # The label file marks ham with 1 and spam with 0; inverting the label
    # makes True mean spam.
    annotations = [
        not bool(int(annotation)) for annotation, data_file in annotations
    ]

    spam_examples = list()
    spam_annotations = list()
    ham_examples = list()
    ham_annotations = list()

    def partition_scheme(example, annotation):
        # Route each pair into the spam or ham lists by its label.
        if annotation:
            spam_examples.append(example)
            spam_annotations.append(annotation)
        else:
            ham_examples.append(example)
            ham_annotations.append(annotation)

    abstract_partition(examples, annotations, partition_scheme)

    assert len(ham_examples) + len(spam_examples) == len(examples)
    return (spam_examples, spam_annotations), (ham_examples, ham_annotations)
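
These examples rely on an abstract_partition helper, and Example #1 also on a
tokenize helper, neither of which is shown. Minimal sketches consistent with
the call sites (the implementations here are assumptions, not the originals):

import re

def abstract_partition(examples, annotations, partition_scheme):
    # Assumed behavior: apply the callback to each aligned pair.
    for example, annotation in zip(examples, annotations):
        partition_scheme(example, annotation)

def tokenize(text):
    # Assumed behavior: lowercase word tokens; the real tokenizer may differ.
    return re.findall(r"[a-z']+", text.lower())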
Example #2
import random

def partition_datasets(spam_dataset, ham_dataset):
    spam_examples, spam_annotations = spam_dataset
    ham_examples, ham_annotations = ham_dataset

    training_examples = list()
    training_annotations = list()

    validation_examples = list()
    validation_annotations = list()

    # Accumulate counts in a list so the nested closure below can mutate it
    # without needing nonlocal.
    spam_in_training = list()

    random.seed(12345)
    # Despite the name, this is a fraction (0.4 == 40%), not a percentage.
    percent_of_data_in_validation_set = 0.4

    def partition_scheme(example, annotation):
        # Send roughly 40% of the pairs to validation and the rest to
        # training, counting how much spam lands in training.
        if random.uniform(0, 1) <= percent_of_data_in_validation_set:
            validation_examples.append(example)
            validation_annotations.append(annotation)
        else:
            training_examples.append(example)
            training_annotations.append(annotation)
            if annotation:
                spam_in_training.append(1)

    abstract_partition(ham_examples, ham_annotations, partition_scheme)
    abstract_partition(spam_examples, spam_annotations, partition_scheme)

    spam_in_training = sum(spam_in_training)

    print("spam_in_training: %s, ham_in_training: %s" %
          (spam_in_training, len(training_examples) - spam_in_training))
    print("spam_in_validation: %s, ham_in_validation: %s" %
          (len(spam_examples) - spam_in_training,
           len(validation_examples) - len(spam_examples) + spam_in_training))

    return (training_examples, training_annotations),\
           (validation_examples, validation_annotations)
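
A minimal driver chaining Examples #1 and #2 (assuming the dataset lives at
the path Example #1 expects) could look like:

spam_dataset, ham_dataset = create_dataset()
training_set, validation_set = partition_datasets(spam_dataset, ham_dataset)
training_examples, training_annotations = training_set
validation_examples, validation_annotations = validation_set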
Example #3
import operator
import os
import random

def create_and_partition_dataset():
    data_toplevel_path = os.path.join("/scratch", "madeline", "CSDMC2010_SPAM")
    raw_data_path = os.path.join(data_toplevel_path, "TRAINING")  # includes validation data too

    examples = list()

    ### Read in data set ###

    # Each line of the label file reads "<label> <filename>".
    with open(os.path.join(data_toplevel_path, "SPAMTrain.label"), "r") as f:
        annotations = [line.split() for line in f]

    # Read each message; unlike Example #1, the raw text is kept unprocessed.
    for data_file in os.listdir(raw_data_path):
        with open(os.path.join(raw_data_path, data_file), "r") as f:
            text = f.read()
        examples.append((data_file, text))

    # Sort both lists by filename so examples and annotations line up.
    examples = sorted(examples, key=operator.itemgetter(0))
    annotations = sorted(annotations, key=operator.itemgetter(1))

    examples = [text for data_file, text in examples]
    # The label file marks ham with 1 and spam with 0, so True means spam.
    annotations = [not bool(int(annotation)) for annotation, data_file in annotations]

    spam_examples = list()
    spam_annotations = list()
    ham_examples = list()
    ham_annotations = list()

    def partition_scheme(example, annotation):
        # Route each pair into the spam or ham lists by its label.
        if annotation:
            spam_examples.append(example)
            spam_annotations.append(annotation)
        else:
            ham_examples.append(example)
            ham_annotations.append(annotation)

    abstract_partition(examples, annotations, partition_scheme)

    assert len(ham_examples) + len(spam_examples) == len(examples)

    ### partition it into training and validation sets ###

    training_examples = list()
    training_annotations = list()

    validation_examples = list()
    validation_annotations = list()

    # Accumulate counts in a list so the nested closure below can mutate it
    # without needing nonlocal.
    spam_in_training = list()

    random.seed(12345)
    # Despite the name, this is a fraction (0.4 == 40%), not a percentage.
    percent_of_data_in_validation_set = 0.4

    def partition_scheme(example, annotation):
        # Send roughly 40% of the pairs to validation and the rest to
        # training, counting how much spam lands in training.
        if random.uniform(0, 1) <= percent_of_data_in_validation_set:
            validation_examples.append(example)
            validation_annotations.append(annotation)
        else:
            training_examples.append(example)
            training_annotations.append(annotation)
            if annotation:
                spam_in_training.append(1)

    abstract_partition(ham_examples, ham_annotations, partition_scheme)
    abstract_partition(spam_examples, spam_annotations, partition_scheme)

    spam_in_training = sum(spam_in_training)

    print("spam_in_training: %s, ham_in_training: %s" % (spam_in_training, len(training_examples) - spam_in_training))
    print("spam_in_validation: %s, ham_in_validation: %s" % (len(spam_examples) - spam_in_training, len(validation_examples) - len(spam_examples) + spam_in_training))

    return (training_examples, training_annotations),\
           (validation_examples, validation_annotations)
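
Example #3 simply inlines Examples #1 and #2 into one function (without the
tokenization step) and still depends on the abstract_partition helper sketched
above. Usage is a single call:

training_set, validation_set = create_and_partition_dataset()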