Esempio n. 1
0
def build_extraction_dataset(folder, dataset_filename,
                             sender_known=True):
    """Write a line-level, labelled dataset for signature extraction.

    Every entry of `folder` is parsed with `parse_msg_sender`; entries
    yielding an empty sender or an empty body are skipped. For the
    others, at most `SIGNATURE_MAX_LINES` trailing lines are visited,
    starting from the last line and moving upwards.

    A line that starts with `SIGNATURE_ANNOTATION` is labelled 1, any
    other line -1. The signature or reply annotation prefix is removed
    before the line is turned into a pattern.

    Each pattern is written to `dataset_filename` as one comma-separated
    row with the label in the last column. A pre-existing dataset file
    is deleted first.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)

    with open(dataset_filename, 'a') as out:
        for entry in os.listdir(folder):
            path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(path, sender_known)
            if not (sender and msg):
                continue

            body_lines = msg.splitlines()
            total = len(body_lines)
            # How many lines, counted from the end, are inspected.
            depth = min(SIGNATURE_MAX_LINES, total)
            # Slicing from `total - depth` (rather than `-depth`) keeps
            # the tail empty when `depth` is zero or negative.
            for text in reversed(body_lines[total - depth:]):
                if text[:len(SIGNATURE_ANNOTATION)] == SIGNATURE_ANNOTATION:
                    label = 1
                    text = text[len(SIGNATURE_ANNOTATION):]
                else:
                    label = -1
                    if text[:len(REPLY_ANNOTATION)] == REPLY_ANNOTATION:
                        text = text[len(REPLY_ANNOTATION):]

                row = build_pattern(text, features(sender))
                row.append(label)
                out.write(','.join(map(str, row)) + '\n')
Esempio n. 2
0
def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Create the signature extraction dataset from emails in `folder`.

    The emails are expected to be annotated: lines that belong to the
    signature start with `SIGNATURE_ANNOTATION`. Only the last
    `SIGNATURE_MAX_LINES` lines of each email are considered, walked
    from the bottom up. Every considered line becomes one
    comma-separated row in `dataset_filename` whose last value is the
    label (1 for signature lines, -1 otherwise).

    Any existing file at `dataset_filename` is removed beforehand.
    Emails for which `parse_msg_sender` gives an empty sender or an
    empty message are ignored.
    """
    def label_and_strip(raw):
        # Give back (label, line without its annotation prefix).
        sig_end = len(SIGNATURE_ANNOTATION)
        if raw[:sig_end] == SIGNATURE_ANNOTATION:
            return 1, raw[sig_end:]
        reply_end = len(REPLY_ANNOTATION)
        if raw[:reply_end] == REPLY_ANNOTATION:
            return -1, raw[reply_end:]
        return -1, raw

    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)

    with open(dataset_filename, 'a') as dataset:
        for name in os.listdir(folder):
            email_path = os.path.join(folder, name)
            sender, msg = parse_msg_sender(email_path, sender_known)
            if not sender or not msg:
                continue

            lines = msg.splitlines()
            last = len(lines) - 1
            # Exclusive lower bound of the descending index walk.
            stop = last - min(SIGNATURE_MAX_LINES, len(lines))
            for idx in range(last, stop, -1):
                label, text = label_and_strip(lines[idx])
                pattern = build_pattern(text, features(sender))
                pattern.append(label)
                fields = [str(value) for value in pattern]
                dataset.write(','.join(fields) + '\n')
Esempio n. 3
0
def test_build_pattern():
    """Check the pattern built for a multi-line signature block."""
    # Same text as a triple-quoted literal: content lines separated by
    # single empty lines, no trailing newline.
    signature = '\n'.join([
        'John Doe',
        '',
        'VP Research and Development, Xxxx Xxxx Xxxxx',
        '',
        '555-226-2345',
        '',
        '[email protected]',
    ])
    sender_features = fs.features('John <*****@*****.**>')
    expected = [2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1]
    eq_(fs.build_pattern(signature, sender_features), expected)
Esempio n. 4
0
def build_extraction_dataset(repetition,
                             source_folder,
                             emails,
                             dataset_filename,
                             sender_known=True):
    """Builds signature extraction dataset from the listed `emails`.

    The emails should be annotated, i.e. signature lines should start
    with `SIGNATURE_ANNOTATION`.

    :param repetition: suffix appended to `dataset_filename` with `+`,
        so it has to be a string.
    :param source_folder: prefix prepended verbatim to every item of
        `emails`; no path separator is inserted.
    :param emails: iterable of email file names to process.
    :param dataset_filename: base name of the dataset file.
    :param sender_known: forwarded to `parse_msg_sender`.
    :return: name of the dataset file that was written
        (`dataset_filename + repetition`).

    For each email with a non-empty sender and body, the last
    `SIGNATURE_MAX_LINES` lines are visited bottom-up and written as
    comma-separated rows; the final column is the label (1 for
    signature lines, -1 otherwise). An existing dataset file is deleted
    first.
    """
    dataset_filename = dataset_filename + repetition
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for email in emails:
            filename = source_folder + email
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue

            # Reset the per-email result file. Nothing is written to it
            # here; it is only (re)created empty.
            result_filename = build_result_filename(filename)
            if os.path.exists(result_filename):
                os.remove(result_filename)
            with open(result_filename, 'a'):
                pass

            lines = msg.splitlines()
            count = min(SIGNATURE_MAX_LINES, len(lines))
            # Tail slice + reversed() instead of xrange/negative indexing:
            # works on Python 2 and 3, and stays empty when count <= 0.
            for line in reversed(lines[len(lines) - count:]):
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    line = line[len(REPLY_ANNOTATION):]
                X = build_pattern(line, features(sender))
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
    return dataset_filename
Esempio n. 5
0
def build_extraction_dataset(folder, dataset_filename,
                             sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated, i.e. signature lines
    should start with `SIGNATURE_ANNOTATION`.

    :param folder: directory whose entries are parsed as emails.
    :param dataset_filename: output file; deleted first if it exists.
    :param sender_known: forwarded to `parse_msg_sender`.

    For every email with a non-empty sender and body, the last
    `SIGNATURE_MAX_LINES` lines are visited from the bottom up. Each one
    is written as a comma-separated row whose final column is the label:
    1 for signature lines, -1 for all others.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue

            lines = msg.splitlines()
            count = min(SIGNATURE_MAX_LINES, len(lines))
            # A tail slice walked in reverse replaces the xrange-based
            # negative indexing, so the loop runs on Python 2 and 3.
            # Slicing from `len(lines) - count` yields nothing when
            # count is zero or negative.
            for line in reversed(lines[len(lines) - count:]):
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    # Reply lines keep the negative label; only the
                    # marker is stripped.
                    line = line[len(REPLY_ANNOTATION):]
                X = build_pattern(line, features(sender))
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
Esempio n. 6
0
def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated, i.e. signature lines
    should start with `SIGNATURE_ANNOTATION`.

    :param folder: directory whose entries are parsed as emails.
    :param dataset_filename: output file; removed first when present.
    :param sender_known: forwarded to `parse_msg_sender`.

    Emails with an empty sender or body are skipped. For the rest, up
    to `SIGNATURE_MAX_LINES` lines are taken from the end of the body,
    last line first, and each is appended to the dataset as a
    comma-separated row ending with its label (1 = signature line,
    -1 = anything else).
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue

            lines = msg.splitlines()
            count = min(SIGNATURE_MAX_LINES, len(lines))
            # reversed() over the tail slice is portable across Python 2
            # and 3 (no xrange); the slice is empty for count <= 0.
            for line in reversed(lines[len(lines) - count:]):
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                else:
                    label = -1
                    if line.startswith(REPLY_ANNOTATION):
                        line = line[len(REPLY_ANNOTATION):]
                X = build_pattern(line, features(sender))
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
Esempio n. 7
0
def extract_training_vectors(emails, csv_file, output_extraction_file):
    """Write labelled line patterns for the selected rows of a CSV file.

    :param emails: container of file names; only CSV rows whose
        'filename' value is in it are used.
    :param csv_file: path of a CSV file with a header row providing at
        least the 'filename', 'sender', 'origin' and 'sig' columns.
    :param output_extraction_file: output path, opened in 'w' mode so
        previous content is overwritten.

    For each selected row, one comma-separated pattern is written per
    distinct line of the 'origin' text. The last column is the label:
    1 when the same line also occurs in the 'sig' text, 0 otherwise.
    """
    with open(output_extraction_file, 'w') as dataset:
        with open(csv_file, 'r') as csvinput:
            for row in csv.DictReader(csvinput):
                if row['filename'] not in emails:
                    continue
                sender = row['sender']
                # NOTE: sets drop duplicate lines and do not preserve the
                # original line order, so row order in the output is
                # arbitrary.
                lines = set(row['origin'].splitlines())
                sigs = set(row['sig'].splitlines())
                for line in lines:
                    label = 1 if line in sigs else 0
                    X = build_pattern(line, features(sender))
                    X.append(label)
                    labeled_pattern = ','.join([str(e) for e in X])
                    dataset.write(labeled_pattern + '\n')
Esempio n. 8
0
def build_detection_class(folder, dataset_filename, label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:
    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    The patterns are built from the emails in `folder` and appended to
    the dataset file, one comma-separated row per email with `label` as
    the last value. Emails for which `parse_msg_sender` returns `None`
    as sender or message are skipped. Every match of any entry of
    `ANNOTATIONS` is removed from the message before the pattern is
    built.

    >>> build_detection_class('emails/P', 'train.data', 1)  # doctest: +SKIP
    """
    # The annotation pattern does not depend on the email: build it once.
    # The entries are joined as-is, i.e. treated as regular expressions.
    annotation_re = re.compile('|'.join(ANNOTATIONS))
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if sender is None or msg is None:
                continue
            msg = annotation_re.sub('', msg)
            X = build_pattern(msg, features(sender))
            X.append(label)
            labeled_pattern = ','.join([str(e) for e in X])
            dataset.write(labeled_pattern + '\n')
Esempio n. 9
0
def build_detection_class(folder, dataset_filename,
                          label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:
    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    The patterns are built from emails in `folder` and appended to the
    dataset file: each email contributes one comma-separated row that
    ends with `label`. An email is skipped when `parse_msg_sender`
    returns `None` for its sender or its message. Occurrences of the
    `ANNOTATIONS` entries are stripped from the message before the
    pattern is computed.

    >>> build_detection_class('emails/P', 'train.data', 1)  # doctest: +SKIP
    """
    # Compiled once up front: the pattern is the same for every email.
    # Entries are joined unescaped, so they act as regular expressions.
    strip_annotations = re.compile('|'.join(ANNOTATIONS)).sub
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if sender is None or msg is None:
                continue
            msg = strip_annotations('', msg)
            X = build_pattern(msg, features(sender))
            X.append(label)
            labeled_pattern = ','.join([str(e) for e in X])
            dataset.write(labeled_pattern + '\n')
Esempio n. 10
0
def is_signature_line(line, sender, classifier):
    '''Tell whether `line` is classified as part of the signature.

    The pattern built from `line` and the sender features is wrapped in
    a one-item `SparseDataSet`; the line counts as signature when the
    classifier's decision function value for that item is positive.
    '''
    pattern = build_pattern(line, features(sender))
    data = SparseDataSet([pattern])
    score = classifier.decisionFunc(data, 0)
    return score > 0
Esempio n. 11
0
def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature.

    The pattern built from `line` and the sender features is passed to
    `classifier.predict` as a single sample.

    Returns the result of comparing the prediction with 0. With an
    array-returning `predict` this is a one-element boolean array,
    which is truthy exactly when the line is classified as signature.
    '''
    pattern = build_pattern(line, features(sender))
    # Shape the sample as (1, n_features): scikit-learn style estimators
    # expect a 2-D array and reject a bare 1-D feature vector.
    # NOTE(review): assumes `classifier` follows that convention.
    data = numpy.array(pattern).reshape(1, -1)
    return classifier.predict(data) > 0
Esempio n. 12
0
def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature.

    Builds the feature pattern for `line` (using the features derived
    from `sender`) and asks `classifier.predict` about that one sample.

    Returns `prediction > 0`. When `predict` returns an array, this is
    a one-element boolean array whose truth value tells whether the
    line was classified as signature.
    '''
    pattern = build_pattern(line, features(sender))
    # A single sample has to be a 2-D (1, n_features) array for
    # scikit-learn style `predict`; a 1-D vector is rejected there.
    # NOTE(review): assumes `classifier` follows that convention.
    data = numpy.array(pattern).reshape(1, -1)
    return classifier.predict(data) > 0
Esempio n. 13
0
def is_signature_line(line, sender, classifier):
    '''Decide whether `line` belongs to the signature.

    A single pattern is built for the line, placed in a `SparseDataSet`
    and scored with `classifier.decisionFunc`; the answer is whether
    the score of that first (and only) item is greater than zero.
    '''
    sender_features = features(sender)
    sample = SparseDataSet([build_pattern(line, sender_features)])
    return classifier.decisionFunc(sample, 0) > 0