Beispiel #1
0
def process_email(email_contents):
    """Pre-process an email body and return vocabulary indices of its words.

    :param email_contents: raw body text of an email
    :return: list of vocabulary indices, one per recognized (stemmed) word
    """
    # NOTE(review): helper name looks like a typo for get_vocabulary_list —
    # kept as-is because it is defined elsewhere in this project.
    voca_list = get_vacabulary_list('./vocab.txt')
    word_indices = []

    # Normalize: lower-case, strip HTML tags, drop digits, and collapse
    # URLs, e-mail addresses and $ signs into canonical tokens.
    email_contents = email_contents.lower()
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    email_contents = re.sub(r'[0-9]+', ' ', email_contents)
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr',
                            email_contents)
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)
    print('\n==== Processed Email ====\n\n')

    # Python and MATLAB regex syntax differ; some symbols need escaping here.
    tokens = re.split(
        r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    stemmer = PorterStemmer()  # hoisted: one instance instead of one per token
    for vocabulary in tokens:
        vocabulary = re.sub(r'[^a-zA-Z0-9]', '', vocabulary)
        vocabulary = stemmer.stem(vocabulary.strip())

        # Skip tokens that are too short to be meaningful.
        if len(vocabulary) <= 1:
            continue

        # Record the vocabulary index of each recognized word.
        # (The original assigned index = 0 in an else branch and never used
        # it — dead code, removed.)
        if vocabulary in voca_list:
            word_indices.append(voca_list[vocabulary])
    print('\n\n=========================\n')

    return word_indices
Beispiel #2
0
def train_arrays(headlines, tag):
    """Build binary feature vectors and class labels for the given headlines.

    Each headline gets label 0 when its <tag> value is at or below the median
    of all values, else 1.  Returns (feature_vectors, labels, unique_words).
    """
    median_value = analyse.find_median(analyse.make_list_num(headlines, tag))
    stemmed_words = []
    labels = []
    for node in headlines:
        body = node.getElementsByTagName(
            "text")[0].childNodes[0].data.encode('utf-8')
        score = int(node.getElementsByTagName(tag)[0].childNodes[0].data)
        labels.append(0 if score <= median_value else 1)

        # Strip every punctuation character from the headline text.
        for punct in string.punctuation:
            body = body.replace(punct, '')

        # Stem each remaining word and collect it in lower case.
        for word in body.split():
            stemmed = PorterStemmer().stem_word(word)
            stemmed_words.append(stemmed.strip().lower())

    unique_words = make_no_dub_list(stemmed_words)
    feature_vectors = make_array(unique_words, headlines)
    return feature_vectors, labels, unique_words
Beispiel #3
0
def make_array(noDub, headlines):
    """Build a 0/1 bag-of-words vector per headline over the noDub vocabulary.

    :param noDub: list of unique (stemmed, lower-cased) vocabulary words
    :param headlines: DOM nodes, each containing a <text> child element
    :return: list of binary vectors, one per headline, aligned with noDub
    """
    # Map word -> position once: O(1) lookups instead of the original
    # O(n) "w in noDub" test plus O(n) noDub.index(w) scan per word.
    # setdefault keeps the FIRST occurrence, matching list.index() semantics.
    word_to_index = {}
    for i, word in enumerate(noDub):
        word_to_index.setdefault(word, i)

    stemmer = PorterStemmer()  # hoisted: one instance for all words
    all_vektors = []
    for headline in headlines:
        headline_vektor = [0] * len(noDub)

        text = headline.getElementsByTagName(
            "text")[0].childNodes[0].data.encode('utf-8')
        for w in text.split():
            # Remove punctuation before stemming.
            for char in string.punctuation:
                w = w.replace(char, '')

            w = stemmer.stem_word(w)
            w = w.strip().lower()

            if w in word_to_index:
                headline_vektor[word_to_index[w]] = 1
        all_vektors.append(headline_vektor)
    return all_vektors
Beispiel #4
0
    def processEmail(self, email_contents):
        """Preprocess the body of an email and return a list of word indices.

        word_indices = PROCESSEMAIL(email_contents) preprocesses the body of
        an email and returns a list of indices of the words contained in the
        email, looked up in the vocabulary list.
        """

        # Get vocabulary (word -> index mapping).
        vocabList = self.getVocabList()

        word_indices = []

        # ========================= Preprocess Email ==========================
        # Find the headers (\n\n and remove).
        # Uncomment the following lines if you are working with raw emails
        # with the full headers:
        # hdstart = email_contents
        # if hdstart:
        #     email_contents = email_contents[hdstart:]

        # All to lower case.
        email_contents = email_contents.lower()

        # Strip all HTML: any expression that starts with < and ends with >
        # (with no < or > inside) is replaced by a space.
        email_contents = re.sub('<[^<>]+>', ' ', email_contents)

        # Normalize numbers: one or more digits become a 'number' token.
        email_contents = re.sub('[0-9]+', "number", email_contents)

        # Normalize URLs: strings starting with http:// or https://.
        email_contents = re.sub('(http|https)://[^\s]*', "httpaddr",
                                email_contents)

        # Normalize email addresses: non-whitespace runs with @ in the middle.
        # FIX: pattern was '[^s\s]+@[^\s]+', which wrongly excluded the
        # letter 's' from the local part and failed on addresses whose local
        # part ends in 's'.
        email_contents = re.sub('[^\s]+@[^\s]+', "emailaddr", email_contents)

        # Normalize dollars: sequences of $ signs become a 'dollar' token.
        email_contents = re.sub('[$]+', "dollar", email_contents)

        # ============================ Tokenize Email =========================

        print("===== Email Processed =====")

        # Split on punctuation/whitespace (slightly different order from the
        # MATLAB version; regex may need further debugging).
        email_contents = re.split(
            r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

        stemmer = PorterStemmer()  # hoisted: one stemmer for all tokens
        for token in email_contents:

            # Remove any non-alphanumeric characters.
            token = re.sub("[^a-zA-Z0-9]", '', token)

            # Stem the word.
            token = stemmer.stem(token.strip())

            if len(token) < 1:
                continue

            # Look the stemmed token up in the vocabulary fetched above.
            # FIX: previously read self.vocabList while the freshly loaded
            # local vocabList went unused — assumes getVocabList() returns
            # the same mapping; TODO confirm against the class definition.
            indx = vocabList[token] if token in vocabList else 0

            if indx > 0:
                word_indices.append(indx)

        print("\n\n================================\n")

        return word_indices
Beispiel #5
0
def process_email(email_contents: str) -> List[int]:
    """Pre-process the body of an email and return a list of indices of the
    words contained in the email.

    :param email_contents: the body of an email
    :return: a list of indices of the words contained in the email
    """

    # Load the vocabulary (index -> word).
    vocabulary_dict = get_vocabulary_dict()

    # Invert the mapping once so each token lookup is O(1) instead of the
    # original O(V) scan over vocabulary_dict.items() for every token.
    # (Assumes vocabulary words are unique — TODO confirm.)
    word_to_index = {word: i for i, word in vocabulary_dict.items()}

    # Initialize the return value.
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with
    # the full headers:
    # header_token = '\n\n'
    # header_start = email_contents.find(header_token)
    # email_contents = email_contents[header_start+len(header_token):]

    # Convert email content to lower case.
    email_contents = email_contents.lower()

    # Strip all HTML: any expression that starts with < and ends with >
    # (with no < or > inside) is replaced with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle numbers: all sequences of digits become a 'number' token.
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLs: strings starting with http:// or https:// become a
    # 'httpaddr' token.
    email_contents = re.sub('(http://|https://)+\S*', 'httpaddr',
                            email_contents)

    # Handle email addresses: strings with @ in the middle become an
    # 'emailaddr' token.
    email_contents = re.sub('[\S*]+(@)+\S*', 'emailaddr', email_contents)

    # Handle $ sign: all sequences of $ signs become a 'dollar' token.
    # FIX: pattern was '[$]' (matching one $ at a time, so "$$" became
    # "dollardollar"), contradicting this comment and the sibling
    # implementations, which all use '[$]+'.
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')
    # Current output column, used to wrap the printed tokens at 78 chars.
    col = 0

    # Tokenize and also get rid of any punctuation
    tokens = re.split('[ @$/#.-:&*\+=\[\]?!\(\)\{\},'
                      '">_<;#\n\r]', email_contents)

    stemmer = PorterStemmer()  # hoisted: one stemmer instance for all tokens
    for token in tokens:

        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word
        token = stemmer.stem(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if found.
        if token in word_to_index:
            word_indices.append(word_to_index[token])

        # Print to screen, ensuring that the output lines are not too long
        if (col + len(token) + 1) > 78:
            print('')
            col = 0
        print('{} '.format(token), end='', flush=True)
        col = col + len(token) + 1

    # Print footer
    print('\n\n=========================\n')

    return word_indices
Beispiel #6
0
def processEmail(email_contents):
    """Preprocess the body of an email and return a list of word indices.

    word_indices = PROCESSEMAIL(email_contents) preprocesses the body of an
    email and returns a list of indices of the words contained in the email.
    """

    # Load Vocabulary (word -> index mapping; gvl is imported elsewhere in
    # this file).
    vocabList = gvl.getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with
    # the full headers:
    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: expressions that start with < and end with > (with no
    # < or > inside) are replaced with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more digits become a 'number' token.
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS: strings starting with http:// or https://.
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses: strings with @ in the middle.
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign: sequences of $ signs become a 'dollar' token.
    email_contents = re.sub('[$]+', 'dollar', email_contents)


    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Current output column, used to wrap printed tokens at 78 characters.
    l = 0

    # Slightly different order from matlab version

    # Split and also get rid of any punctuation
    # regex may need further debugging...
    email_contents = re.split(r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    for token in email_contents:

        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word (stem_word is the older NLTK PorterStemmer API).
        token = PorterStemmer().stem_word(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
           continue

        # Look up the word in the dictionary and add to word_indices if
        # found: idx is the word's vocabulary index, or 0 when the token is
        # not present in vocabList.
        idx = vocabList[token] if token in vocabList else 0

        # only add entries which are in vocabList
        #   i.e. those with idx != 0,
        #        given that idx is assigned 0 if the token is not found in vocabList
        if idx > 0:
            word_indices.append(idx)

        # Print to screen, ensuring that the output lines are not too long.
        # NOTE(review): the trailing comma below is a Python 2 print idiom;
        # under Python 3 this prints a newline per token (defeating the
        # wrapping) and builds a throwaway 1-tuple — confirm which
        # interpreter this file targets.
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        print('{:s}'.format(token)),
        l = l + len(token) + 1

    # Print footer
    print('\n\n=========================\n')

    return word_indices
Beispiel #7
0
def process_email(email_contents: str) -> List[int]:
    """Pre-process the body of an email and return a list of indices of the
    words contained in the email.

    :param email_contents: the body of an email
    :return: a list of indices of the words contained in the email
    :raises ValueError: if a vocabulary word unexpectedly maps to index 0
    """

    vocabulary_dict = get_vocabulary_dict()

    # Initialize the return value.
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with
    # the full headers:
    # header_token = '\n\n'
    # header_start = email_contents.find(header_token)
    # email_contents = email_contents[header_start+len(header_token):]

    # Convert email content to lower case.
    email_contents = email_contents.lower()

    # Strip all HTML: any expression that starts with < and ends with >
    # (with no < or > inside) is replaced with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle numbers: all sequences of digits become a 'number' token.
    email_contents = re.sub('\d+', 'number', email_contents)

    # Handle URLs: strings starting with http:// or https:// become a
    # 'httpaddr' token.
    email_contents = re.sub('(http://|https://)[\w\.-]+', 'httpaddr', email_contents)

    # Handle email addresses: strings with @ in the middle become an
    # 'emailaddr' token.
    email_contents = re.sub('[\w\.-]+@[\w\.-]+', 'emailaddr', email_contents)

    # Handle $ sign: all sequences of $ signs become a 'dollar' token.
    email_contents = re.sub('\$+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Current output column, used to wrap the printed tokens at 78 chars.
    col = 0

    # Tokenize and also get rid of any punctuation
    tokens = re.split('[ @$/#.-:&*\+=\[\]?!\(\)\{\},''">_<;#\n\r]', email_contents)

    stemmer = PorterStemmer()  # hoisted: one stemmer instance for all tokens
    for token in tokens:

        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word
        token = stemmer.stem(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if
        # found (get_key maps a word back to its vocabulary index).
        if token in vocabulary_dict.values():
            index = get_key(token, vocabulary_dict)
            if index == 0:
                raise ValueError(
                    'vocabulary index 0 for token {!r}'.format(token))
            else:
                word_indices.append(index)

        # Print to screen, ensuring that the output lines are not too long.
        # FIX: the column counter previously advanced by len(tokens) — the
        # size of the whole token list — instead of len(token), breaking the
        # 78-column line wrapping.
        if (col + len(token) + 1) > 78:
            print('')
            col = 0
        print('{} '.format(token), end='', flush=True)
        col = col + len(token) + 1

    # Print footer
    print('\n\n=========================\n')

    return word_indices
Beispiel #8
0
def processEmail(email_contents):
    """Preprocess the body of an email and return a list of word indices.

    word_indices = PROCESSEMAIL(email_contents) preprocesses the body of an
    email and returns a list of indices of the words contained in the email.
    """

    # Load Vocabulary (word -> index mapping).
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with
    # the full headers:
    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: expressions that start with < and end with > (with no
    # < or > inside) are replaced with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more digits become a 'number' token.
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS: strings starting with http:// or https://.
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr',
                            email_contents)

    # Handle Email Addresses: strings with @ in the middle.
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign: sequences of $ signs become a 'dollar' token.
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Current output column, used to wrap printed tokens at 78 characters.
    l = 0

    # Split and also get rid of any punctuation (slightly different order
    # from the MATLAB version; regex may need further debugging).
    email_contents = re.split(
        r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    stemmer = PorterStemmer()  # hoisted: one stemmer instance for all tokens
    for token in email_contents:

        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word
        token = stemmer.stem(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
            continue

        idx = vocabList[token] if token in vocabList else 0

        # Only add entries which are in vocabList (idx is 0 when the token
        # was not found).
        if idx > 0:
            word_indices.append(idx)

        # Print to screen, ensuring that the output lines are not too long.
        # FIX: was "print('{:s}'.format(token))," — a Python 2 leftover
        # that under Python 3 prints a newline per token (defeating the
        # 78-column wrapping) and builds a throwaway 1-tuple.
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        print('{:s}'.format(token), end=' ')
        l = l + len(token) + 1

    print('\n\n=========================\n')

    return word_indices