Example #1
def tokenize_on_porter(text):
    word_list = []
    p = PorterStemmer()
    outfile = open('out3', 'w')
    for line in text.splitlines():
        output = ''
        word = ''
        if line != '':
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        word_stem = p.stem(word, 0, len(word) - 1)
                        output += word_stem
                        word_list.append(word_stem)
                        word = ''
                    output += c.lower()
            # flush a word that ends the line; without this the last word
            # on each line would be silently dropped
            if word:
                word_stem = p.stem(word, 0, len(word) - 1)
                output += word_stem
                word_list.append(word_stem)
                word = ''
        print(output, end='\n', file=outfile)
    outfile.close()
    return word_list
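
A minimal usage sketch, assuming the Gupta-style porter module imported in Example #6 (its stem(word, i, j) signature matches the call above):

from porter import PorterStemmer  # assumed; see Example #6

# also writes the stemmed text to the file 'out3'
tokens = tokenize_on_porter("Stemming maps related words onto a common root.")
print(tokens)  # e.g. ['stem', 'map', 'relat', 'word', 'onto', 'a', 'common', 'root']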
Example #2
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.bins = 50
Example #3
    def build(self, docpath, outfile):
        p = PorterStemmer()
        sw = stopwords.StopWords(self.stopword_file)

        ndx = defaultdict(list)

        for filename in os.listdir(docpath):
            if not filename.endswith(".txt"): continue

            doc_id = hash(filename.replace(".txt", ""))
            with open(os.path.join(docpath, filename)) as f:
                f_content = kwutils.normalize(f.read().lower())

            words = kwutils.tokenize(f_content)
            w_stemmed = kwutils.stem(words, p)
            w_stopped = kwutils.filter_stopwords(w_stemmed, sw)

            for word in w_stopped:
                if len(word) > 0:
                    if doc_id not in ndx[word]:
                        ndx[word].append(doc_id)

        with open(outfile, 'w') as f:
            f.write(json.dumps(ndx))
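
The JSON written here maps each stemmed, stop-filtered term to the list of document IDs (hashes of the .txt file names) it occurs in; an illustrative, hand-made example of that shape:

# Illustrative only: real keys are stemmed terms, real values are the
# hash(...) document IDs computed above.
example_index = {
    "retriev": [691752902764107836, -1984375982736],
    "stem": [691752902764107836],
}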
Example #4
    def process_query(self, query):
        all_doc_count = len(self.invert.documents.keys())
        query_array = [x.lower() for x in query.split(' ')]
        query_weights = {}
        stopwords = []
        if self.stopword_toggle:
            stopwords = fetch_stopwords()
        while query_array:
            word = query_array.pop(0)
            frequency = 1

            for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']:
                if a in word:
                    if word.index(a) == 0 or word.index(a) == len(word) - 1:
                        word = word.replace(a, '')

            while word in query_array:
                query_array.pop(query_array.index(word))
                frequency += 1

            if self.stemming_toggle:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)

            if word in stopwords:
                continue

            term_weight = 0
            if word in self.invert.termsDictionary.keys():
                document_frequency = self.invert.termsDictionary[word]
                idf = math.log(all_doc_count / document_frequency)
                term_frequency = 1 + math.log(frequency)
                term_weight = idf * term_frequency

            query_weights[word] = term_weight
        return query_weights
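
The weighting above is standard tf-idf: idf = log(N / df) and tf = 1 + log(f), multiplied together. A tiny worked sketch with made-up numbers:

import math

N = 1000   # all_doc_count (hypothetical)
df = 50    # documents containing the term (hypothetical)
f = 2      # occurrences of the term in the query

idf = math.log(N / df)   # ~3.00
tf = 1 + math.log(f)     # ~1.69
print(tf * idf)          # term weight, ~5.07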
Example #5
poslineedited = []
neglinesedited = []


#there are a total of 6397 positives and negatives.
poslinesTrain= poslines[:3201]
neglinesTrain= neglines[:3196]

priorknowledgepo = []
priorknowledgeneg = []

priorknowledgeneg= 3196/ 6397
priorknowledgepo = 3201/ 6397


stemmer = PorterStemmer()
model = open('F:/ifa/NaiveBayes/model_file.csv', 'w',encoding="utf8")


trainset= [(x,1) for x in poslinesTrain] + [(x,-1) for x in neglinesTrain]
poswords={} #this dictionary stores counts for every word in positives
negwords={} #and negatives

for line,label in trainset: 
    words= getwords(line)

    for word in words:   
        word = word.lower()
        #increment the counts for this word based on the label
        #the .get(x, 0) method returns the current count for word
        #x, or 0 if the word is not yet in the dictionary
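
In isolation, the counting idiom the comment describes (and which Example #10 below spells out) works like this:

counts = {}
for w in ["good", "good", "bad"]:
    counts[w] = counts.get(w, 0) + 1  # current count, or 0 if unseen
print(counts)  # {'good': 2, 'bad': 1}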
Example #6
import re
from porter import PorterStemmer
p = PorterStemmer()


def lcase(text):
    return text.lower()


def prefixes(text):
    return [text[:3], text[:4], text[:5]]


def suffixes(text):
    return [text[-3:], text[-4:], text[-5:]]


def stem(text):
    if text.isalpha():
        return p.stem(text.lower(), 0, len(text) - 1)
    return text


def is_pair_of_digits(text):
    if re.match("^[0-9]{2}$", text):
        return True
    return False


def is_four_digits(text):
    if re.match("^[0-9]{4}$", text):
        return True
    return False
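
A quick sketch of what these token-feature helpers return (the exact stem() output depends on the porter module in use):

print(lcase("Geneva"))          # 'geneva'
print(prefixes("geneva"))       # ['gen', 'gene', 'genev']
print(suffixes("geneva"))       # ['eva', 'neva', 'eneva']
print(stem("running"))          # e.g. 'run'
print(is_pair_of_digits("42"))  # True
print(is_four_digits("1984"))   # True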
Example #7
def stem(word):
    p = PorterStemmer()
    return p.stem(word, 0, len(word) - 1)
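
Hedged usage note: this wrapper assumes a PorterStemmer with the Gupta-style stem(word, i, j) signature is already in scope and expects a single lower-case word:

print(stem("caresses"))    # 'caress'
print(stem("connection"))  # 'connect'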
Example #8
 def __init__(self):
     self.p = PorterStemmer()
     self.sw = stopwords.StopWords(self.stopword_file)
     self.re_tag = RE_TAG
     self.index = None
Example #9
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t')
print("dataset imported")

import re

import nltk

nltk.download('stopwords')

# to remove stopwords
from nltk.corpus import stopwords

# for stemming purposes
#from nltk.stem.porter import PorterStemmer
from porter import PorterStemmer
p = PorterStemmer()
p.stem("Alcoholic")

# Initialize empty array
# to append clean text
corpus = []
for i in range(0, 1000):
    # column : "Review", row ith
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    # convert all cases to lower cases
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()

    # loop for stemming each word
    # in string array at ith row
Example #10
def train():
	poslines = []
	neglines = []

	stopwords= open(r'stopwords.txt', 'r').read().splitlines()
	dataset= open('training_set.csv', 'r',encoding="utf8")

	dataset.readline()

	poslines=[]
	neglines=[]

	for data in dataset:
		data = data.lower()
		datalines = data.split(",")[1].strip('"').split(' ')
		DataClass = data.split(",")[0]
		#tokenizing the sentence
		if int(DataClass)==0:
			poslines.append(datalines)
		elif int(DataClass)==1:
			neglines.append(datalines)
		else:
			continue
	print("The total positive lines are:", len(poslines))
	print("The total negative lines are:", len(neglines))

	poslineedited = []
	neglinesedited = []


	#there are a total of 6397 positives and negatives.
	poslinesTrain= poslines[:3201]
	neglinesTrain= neglines[:3196]

	priorknowledgepo = []
	priorknowledgeneg = []

	priorknowledgeneg= 3196/ 6397
	priorknowledgepo = 3201/ 6397


	stemmer = PorterStemmer()
	model = open('model_file.csv', 'w',encoding="utf8")


	trainset= [(x,1) for x in poslinesTrain] + [(x,-1) for x in neglinesTrain]
	poswords={} #this dictionary stores counts for every word in positives
	negwords={} #and negatives

	for line,label in trainset: 
		words= getwords(line)

		for word in words:   
			word = word.lower()
			#increment the counts for this word based on the label
			#the .get(x, 0) method returns the current count for word
			#x, or 0 if the word is not yet in the dictionary
			if label==1: poswords[word]= poswords.get(word, 0) + 1
			if label==-1: negwords[word]= negwords.get(word, 0) + 1
	positivewordlist = open(r'positive-words.txt', 'r').read().splitlines()
	negativewordlist = open(r'negative-words.txt', 'r').read().splitlines()

	#evaluate the test set
	testset= open('test_set.csv', 'r',encoding="utf8")
	testset.readline()           
	#make predictions
	output = open("prediction_file.csv", 'w')

	for line in testset:
		linesplit = line.split()
		testwords= getwords(linesplit)
		totpos, totneg= 0.0, 0.0
		for word in testwords:
			word = word.lower()
			
			a= poswords.get(word,0.0) + 1.0
			b= negwords.get(word,0.0) + 1.0 
			totpos+= a/(a+b)
			totneg+= b/(a+b) 
			model.write("Word: " +str(word) + ",")
			model.write("Relative positive usage: " + str(totpos)+ ",")
			model.write("Relative negative usage: "+str(totneg)+ '\n')
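
The per-word scores accumulated above are add-one-smoothed relative frequencies; in isolation the arithmetic is:

# Hypothetical counts for one word taken from poswords/negwords
pos_count, neg_count = 12, 3

a = pos_count + 1.0   # add-one smoothing, as in the loop above
b = neg_count + 1.0
print(a / (a + b))    # contribution to totpos, ~0.76
print(b / (a + b))    # contribution to totneg, ~0.24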
Example #11
def test():
    stem = input("Was the stemmer used in the inversion? (Y/N)")
    return_times = []
    g = open("postings.txt", "r")
    content = g.read().replace('\n', ' ')
    post_list = json.loads("[" + content[:-2] + "]")
    h = open("cacm.all", "r")
    lines = h.readlines()

    if g.mode == 'r' and h.mode == 'r':
        word = ""
        while word != "zzend":
            word = input("Enter a term to search for: ").lower()
            if stem == "Y":
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)

            found_word = False
            start = timer()
            for elem in post_list:
                if word == elem[0]:
                    found_word = True
                    print("\nThis term is found in " + str(len(elem[1])) +
                          " documents.")
                    print(
                        "============================================================================="
                    )
                    break
            if found_word:
                print(
                    "This search term is found in the following documents:\n")
                # output all docs that contain that term: DocID, title, TF, all the positions, first occurrence with 10
                # words
                docdata = []
                for entry in post_list:
                    if entry[0] == word:
                        docdata += entry[1]
                        break
                # docdata now has doc ID, TF, and positions for each document input_txt appears in
                # now search in cacm for word data
                count = 0
                get_title = False
                abstract_bool = False
                abstract_text = ""
                title = ""
                output = ""
                found = False
                for line in lines:
                    if count == len(docdata):
                        break
                    if line.startswith(".I " + str(docdata[count][0])):
                        found = True
                    if line == ".B\n" and found:
                        get_title = False
                        abstract_bool = False
                        found = False
                        # I need to create the output string here, as it's all going to be reset now.
                        output += "Document " + str(docdata[count][0]) + " - " + title + "Term frequency: " + \
                                  str(docdata[count][1]) + "\nList of positions: " + str(docdata[count][2]) + \
                                  "\nFirst occurrence in document: " + \
                                  getcontext(title + abstract_text, docdata[count][2][0]) + "\n" + "------------" + "\n"
                        title = ""
                        abstract_text = ""
                        count += 1
                    if abstract_bool:
                        abstract_text += line
                    if line == ".W\n" and found:
                        get_title = False
                        abstract_bool = True
                    if get_title:
                        title += line
                    if line == ".T\n" and found:
                        get_title = True

                end = timer()
                elapsed_time = (end - start)
                if found_word:
                    return_times += [elapsed_time]
                print(output)
                print("Search time: " + str(elapsed_time) + " seconds\n")

                # output time to results
            elif word != "zzend":
                print("Term not found in any documents")
        shutdown(return_times)
        g.close()
        h.close()
    else:
        print("Error opening file. Try again.")
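
From the way post_list is indexed above (elem[0] is the term; elem[1] holds, per document, an ID, a term frequency and a position list), the postings presumably look roughly like this (values invented):

post_list = [
    ["algorithm", [[42, 3, [5, 17, 88]],   # doc 42, tf 3, positions
                   [87, 1, [12]]]],
]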
Example #12
    def __init__(self):

        self.spec_chars_regex = re.compile('[^0-9a-zA-Z]')
        self.camel_case_regex_1 = re.compile('(.)([A-Z][a-z]+)')
        self.camel_case_regex_2 = re.compile('([a-z0-9])([A-Z])')
        self.stemmer = PorterStemmer()  # from Gupta's Porter Stemmer
Example #13
 def __init__(self):
     self._stemmer = PorterStemmer()
Example #14
def lookup(user_input, CLI, K):
    use_stem = False
    stop_words = False
    g = open("postings.txt", "r")
    f = open("cacm.all", "r")
    content = g.read().replace('\n', ' ')
    if content[0] == "1":
        use_stem = True
    if content[1] == "1":
        stop_words = True
    post_list = json.loads("[" + content[2:-2] + "]")
    lines = f.readlines()
    f.close()
    extracted_postings = []
    docs = []
    final_list = []

    if g.mode == 'r':
        # get query
        og_query = user_input.lower()
        og_query = re.sub('[\-]+', ' ', og_query)
        og_query = re.sub('[^A-Za-z0-9$ ]+', '', og_query)
        newquery = og_query.split()

        if stop_words:
            temp = []
            stop_words = open("stopwords.txt", "r").read().split('\n')
            for i in range(len(stop_words)):
                stop_words[i] = stop_words[i].lower()
            for word in newquery:
                if word not in stop_words:
                    temp.append(word)
            newquery = temp
        if use_stem:
            stemmed_query = ""
            for word in newquery:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                stemmed_query += word + " "
            newquery = stemmed_query.split()

        newquery.sort()
        term_list = get_term_lists(newquery, post_list)
        # remove duplicates if they exist
        term_list = list(dict.fromkeys(term_list))

        for entry in term_list:
            extracted_postings.append(post_list[entry])
        # get docs out of extracted postings
        for posting in extracted_postings:
            for entry in posting[1]:
                docs.append(entry[0])
        docs = list(dict.fromkeys(docs))
        docs.sort()
        document_vectors = get_doc_vector(docs, lines, use_stem, stop_words)
        # print("Relevant document vectors created. Now calculating cosine similarity")
        # now, make all of those vectors have tf values, and then weights
        cosine_list = fill_vectors(document_vectors, og_query, docs)
        temp_list = []
        for i in range(len(docs)):
            temp_list.append([docs[i], cosine_list[i]])
        temp_list.sort(key=lambda x: x[1])
        temp_list.reverse()
        if CLI:
            print("Query was: " + user_input + "\n")
            display(temp_list, get_doc_info(docs, lines))
        for elem in temp_list:
            final_list.append(elem[0])
        if K is None:
            return final_list
        else:
            return final_list[:K]
Example #15
 def __init__(self, PATH_TO_STOP_WORDS):
     print("[Tokenizer] Instantiated!")
     self.PATH_TO_STOP_WORDS = PATH_TO_STOP_WORDS
     self.STOP_WORDS = self.load_stopwords()
     self.PorterStemmer = PorterStemmer()
Example #16
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses the body of an email and
    #returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    # Load Vocabulary
    vocab = getVocabDict()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = email_contents.find('\n\n')
    # email_contents = email_contents[hdrstart+2:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >
    # and does not have any < or > in the tag and replace it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr',
                            email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    # Process file
    l = 0
    porterStemmer = PorterStemmer()
    # Tokenize and also get rid of any punctuation
    sep = '[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\},\'\"\>\_\<\;\%\n\r]+'
    for s in re.split(sep, email_contents):
        # Remove any non alphanumeric characters
        s = re.sub('[^a-zA-Z0-9]', '', s)

        # Stem the word
        s = porterStemmer.stem(s.strip())

        # Skip the word if it is too short
        if len(s) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable s. You should look up s in the
        #               vocabulary dictionary (vocab). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if s = 'action', then you should
        #               add to word_indices the value under the key 'action'
        #               in vocab. For example, if vocab['action'] = 18, then,
        #               you should add 18 to the word_indices vector
        #               (e.g., word_indices.append(18) ).
        #

        # =============================================================
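        # A possible shape for the lookup described above (a hedged sketch,
        # not the course's reference solution): vocab maps word -> index, so
        #     if s in vocab:
        #         word_indices.append(vocab[s])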

        # Print to screen, ensuring that the output lines are not too long
        if l + len(s) + 1 > 78:
            print()
            l = 0
        print(s, end=' ')
        l += len(s) + 1

    # Print footer
    print('\n=========================')

    return array(word_indices)