Esempio n. 1
0
    def read_emails(self, path):
        # Get all files
        files = [f for f in listdir(path) if isfile(join(path, f))]

        try:
            del (files[files.index('DS_Store')])
        except:
            pass

        reader = WordListCorpusReader(path, files)

        cleaner = Cleaner()

        emails = list()

        # Creates the Email Object out of each email file and appends to list
        for file_id in reader.fileids():
            with open(path + file_id, 'r') as current_file:
                cleaned_contents = cleaner.clean_file(current_file.read())
                split_email_header, split_email_body, split_email_file_id = self.divide(
                    cleaned_contents, file_id)
                emails.append(
                    Email(split_email_header, split_email_body,
                          split_email_file_id))

        # Return list of Email objects
        return emails
    def evaluate_tags(gr_email, pred_email):

        cleaner = Cleaner()

        regex = {
            'time': r'<[s|e]time>.*?</[s|e]time>',
            'speaker': r'<speaker>.*?</speaker>',
            'location': r'<location>.*?</location>',
            'sentence': r'<sentence>.*?</sentence>',
            'paragraph': r'<paragraph>.*?</paragraph>'
        }

        # gr_email tags -----------------
        gr_email_header = gr_email.header
        gr_email_body = gr_email.body
        gr_email = gr_email_header + gr_email_body

        gr_email_tags = {}
        # Clean from 'newlines'
        gr_email = gr_email.replace('\n', '')

        for k in regex.keys():
            gr_email_tags[k] = re.findall(regex[k], gr_email, re.MULTILINE)
            for i in range(0, len(gr_email_tags[k])):
                gr_email_tags[k][i] = cleaner.clean_file(gr_email_tags[k][i])

        # pred_email tags -------------------
        pred_email_header = pred_email.header
        pred_email_body = pred_email.body
        pred_email = pred_email_header + pred_email_body

        pred_email_tags = {}
        # Clean from 'newlines'
        pred_email = pred_email.replace('\n', '')

        for k in regex.keys():
            pred_email_tags[k] = re.findall(regex[k], pred_email, re.M)
            for i in range(0, len(pred_email_tags[k])):
                pred_email_tags[k][i] = cleaner.clean_file(pred_email_tags[k][i])

        tp = 0
        fp = 0
        fn = 0

        # change gr_tags.keys() to ['key'] to evaluate a specific tag
        for k in gr_email_tags.keys():
            gr = gr_email_tags[k]
            pred = pred_email_tags[k]

            # removing all punctuations and spaces from both email tag lists
            for i in range(0, len(gr)):
                gr[i] = re.sub(r'[^\w\s]', '', gr[i])
                gr[i] = re.sub(' ', '', gr[i])

            for i in range(0, len(pred)):
                pred[i] = re.sub(r'[^\w\s]', '', pred[i])
                pred[i] = re.sub(' ', '', pred[i])

            # Calculating TP, FP, FN
            for t in gr:
                # print(t)
                if t in pred:
                    # print("Got here")
                    tp = tp + 1
                    pred.remove(t)
                else:
                    # print("Got here")
                    fn = fn + 1

            fp = fp + len(pred)

        return tp, fp, fn