Example #1
class Class():
	
	def __init__(self, name, content, count):
		self.name = name
		self.contentRaw = content
		self.tokens = Tokenizer(content)
		self.condProb = self.tokens.getTokens()
		self.count = count
		self.prior = 0.0

	def setPrior(self, prior):
		self.prior = prior

	def condProbs(self):
		return self.condProb

	def getCondProb(self, token):
		# renamed from condProb: the instance attribute of the same name set in
		# __init__ would otherwise shadow this method and make it unreachable
		return self.condProb[token]

	def getName(self):
		return self.name

	def getTokens(self):
		return self.tokens

	def getTokenSum(self):
		return len(self.tokens.getTokens())

	def getTokenSumIgnoreDuplicates(self):
		count = 0
		for t in self.tokens.getTokens():
			count += self.tokens.getTokens()[t]
		return count
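These examples assume a project-level Tokenizer module that is not shown here. A minimal sketch of the interface the call sites above rely on (a constructor that takes raw text and a getTokens() method returning a token-to-count mapping) might look like the following; the names and behaviour are inferred from usage, not taken from the original module.

# Hypothetical Tokenizer stub, inferred from how Class uses it above.
# The real project class may tokenize differently.
from collections import Counter

class Tokenizer(object):
	def __init__(self, content=""):
		self.counts = Counter()
		if content:
			self.tokenize(content)

	def tokenize(self, content):
		# naive lower-cased whitespace tokenization with term counting
		self.counts.update(content.lower().split())
		return self.counts

	def getTokens(self):
		# mapping of token -> occurrence count, as iterated in getTokenSumIgnoreDuplicates()
		return self.counts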
Example #2
	def __init__(self, text, product_name):
		self.candidate_features = []
		self.feature_sentences = []
		self.product_name = product_name.lower().split('-')[0].split('_')
		t = Tokenizer()
		sents = t.sent_tokenize(text.lower())
		p = POSTagger()
		wnl = WordNetLemmatizer()
		for sent in sents:
			tagged_sent = p.nltk_tag(t.word_tokenize(sent))
			feature_sent = {}
			feature_sent['sentence'] = sent
			feature_sent['tags'] = tagged_sent
			feature_sent['nouns'] = []
			feature_sent['noun_phrases'] = []
			for i in range(0, len(tagged_sent)):
				(word, tag) = tagged_sent[i]
				#Don't include proper nouns
				if tag.startswith('N') and tag != 'NNP':
					"""
					Consecutive nouns might form a feature phrase. Eg. Picture quality is a phrase.
					Meaningless phrases like 'quality digital' are removed later as their frequeny of occurence is	low. """
					if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1] and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1:
						feature_sent['noun_phrases'].append(wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
					else:
						feature_sent['nouns'].append(wnl.lemmatize(word))
					
			self.feature_sentences.append(feature_sent)
Example #3
	def process(self, filePath):
		from Tokenizer import Tokenizer
		import codecs


		fileName = filePath[filePath.rindex('/') + 1:]
		tk = Tokenizer()

		ofp = codecs.open(filePath, 'r', 'utf-8')
		lines = ofp.readlines()
		ofp.close()

		# strip a UTF-8 BOM if present; codecs has already decoded it to u'\ufeff'
		if lines and lines[0][0] == u'\ufeff':
			lines[0] = lines[0][1:]
	

		wfp = codecs.open(self.__savePath + fileName, 'w', 'utf-8')


		for line in lines:
			tempLine = line.strip('\r\n')
			tempLine = tempLine.strip(' ')
			sentences = tempLine.split(' ')

			if sentences[0] == '':
				wfp.write('\r\n')
				continue


			for i in range(len(sentences)):
				

				tokenList = tk.ckip((sentences[i].strip(' ')))

				posList = []

				for token in tokenList:
					posList.append(str(len(token["term"])))  # str() so ','.join() below works


				posStr = ''


				if i == 0 and len(sentences) == 1:
					posStr = ','.join(posList)
				elif i == 0:
					posStr = ','.join(posList) + ','
				elif i == len(sentences) - 1:
					posStr = ',' + ','.join(posList)
				else:
					posStr = ',' + ','.join(posList) + ','

				wfp.write(posStr)


			wfp.write('\r\n')


		wfp.close()
		print "Chinese2POS: %s Write Done!!" % fileName
Example #4
class ClassBank:
    def __init__(self):
        self.classes = {}
        self.documentCount = 0
        self.tokenizer = Tokenizer("")

    def addClass(self, classInst):
        self.classes[classInst.getName()] = classInst
        self.tokenizer.tokenize(classInst.contentRaw)
        self.documentCount = self.documentCount + classInst.count

    def getClass(self, name):
        if name in self.classes:
            return self.classes[name]
        return False

    def getClasses(self):
        return self.classes

    def getVocabulary(self):
        return self.tokenizer

    def getVocabularySum(self):
        return len(self.tokenizer.getTokens())

    def train(self):
        v = self.getVocabulary().getTokens()
        n = self.documentCount
        for c in self.classes:
            c = self.classes[c]
            c.setPrior(float(c.count) / n)  # float() avoids integer division in Python 2
            t = c.getTokens().getTokens()
            for key in c.condProb:
                c.condProb[key] = float(t[key] + 1) / (len(t) + v[key])
Example #5
    def init_feature_sentences(self, total_content):
        t = Tokenizer()
        p = POSTagger()
        wnl = WordNetLemmatizer()

        sentences = t.sent_tokenize(total_content.lower())

        for sentence in sentences:
            tagged_sentence = p.nltk_tag(t.word_tokenize(sentence))

            #Initializing Feature Sentence dictionary
            feature_sentence = {}
            feature_sentence['sentence'] = sentence
            feature_sentence['tags'] = tagged_sentence
            feature_sentence['nouns'] = []
            feature_sentence['noun_phrases'] = []

            #Finding the Nouns/Noun Phrases in the tagged sentence
            for i in range(0,len(tagged_sentence)):
                (word, tag) = tagged_sentence[i]

                #Chunking
                if tag.startswith('N') and tag != 'NNP':
                    if i > 0 and len(feature_sentence['nouns']) > 0 and tagged_sentence[i - 1][0] == feature_sentence['nouns'][-1] and feature_sentence['sentence'].find(feature_sentence['nouns'][-1] + ' ' + word) > -1:
                        feature_sentence['noun_phrases'].append(wnl.lemmatize(feature_sentence['nouns'].pop() + ' ' + word))
                    else:
                        feature_sentence['nouns'].append(wnl.lemmatize(word))

            self.feature_sentences.append(feature_sentence)
Example #6
 def indexar(self, str):
     index = {}
     tokenizer = Tokenizer()
     tokens = tokenizer.tokenize(str)
     for term in tokens:
         index[term] = index.get(term, 0) + 1
     return index
def main():
	
	folders = {}
	folders["politik"] = "data/politik"
	folders["sport"] = "data/sport"
	folders["wirtschaft"] = "data/wirtschaft"

	bank = ClassBank()
	l = Loader()

	# train data
	for classname, folder in folders.iteritems():
		count = 0
		content = ""
		for file in os.listdir(folder + "/train/"):
			if file.endswith(".txt"):
				count = count + 1
				content = content + " " + l.load_txt(folder + "/train/" + file)
		c = Class(classname, content, count)
		bank.addClass(c)

	bank.train()
	c = Classifier()

	# test data
	for classname, folder in folders.iteritems():
		print "\n=== Testing", classname, "===\n"
		for file in os.listdir(folder + "/test/"):
			if file.endswith(".txt"):
				tokenizer = Tokenizer(l.load_txt(folder + "/test/" + file))
				classifiedClass = c.classify(tokenizer.getTokens(), bank)
				print file,"=",classifiedClass.getName()
 def test_full_example(self):
     tokenizer = Tokenizer()
     response_builder = ResponseBuilder()
     start_time = time.time()
     tokens = tokenizer.tokenize(FULL_EXAMPLE)
     result = response_builder.process_tokens(tokens)
     end_time = time.time()
     print result
     print "Calculated in %s" % str(end_time - start_time)
Example #9
class UnMinifier:
    constants = JSConstants()

    def __init__(self):
        self.activeKeywords = dict()
        self.tokenizer = Tokenizer()

    def unminify_file(self,filename):
        fileString = ''
        with open(filename, 'r') as fileStream:
            fileString = fileStream.read()

        #1. validate file
        if(self.checkBrackets(fileString) == False):
            return False
        #2. Extract the number of keywords by count
        if(self.xtractKeywordCounts(fileString) < 0):
            return False

        #3. Print the test tokens
        self.tokenizer.tokenize(fileString)

        #Success
        return True

    def checkBrackets(self, fileString):
        try:
            openRounds = fileString.count(self.constants.openRound)
            closeRounds = fileString.count(self.constants.closeRound)

            openCurls = fileString.count(self.constants.openCurl)
            closeCurls = fileString.count(self.constants.closeCurl)

            openSquares = fileString.count(self.constants.openSquare)
            closeSquares = fileString.count(self.constants.closeSquare)

            if(openRounds != closeRounds or openCurls != closeCurls or openSquares != closeSquares):
                raise ValueError("Unbalanced brackets")
            else:
                print ("The brackets are good..")
        except Exception as ex:
            print ("Error:: the file has an unbalanced ( ), { } or [ ] pair. Why don't you check it?")
            return False
        #Success
        return True

    def xtractKeywordCounts(self,fileString):
        # find the number of occurrences in filestring
        try:
            for keyword in self.constants.keywordDictionary.keys():
                if(fileString.count(keyword) > 0 ):
                    self.activeKeywords.update({keyword: fileString.count(keyword)})
        except Exception as ex:
            print ("Error:: Issue with reading form fileString variable")
            return -1
        #Success
        return len(self.activeKeywords.keys())
Example #10
    def test_basic_tokenize(self):
        str = """
S10F11
accept reply: false
asc,"test2"
        """
        tokenizer = Tokenizer()
        result = tokenizer.tokenize(str)
        self.assertEqual(len(result), 3)
Example #11
	def __init__(self, link, pagetitle, outgoing, html):
		self.name 		= link
		self.title 		= pagetitle
		self.outLinks 	= {}
		self.incoming	= {}
		self.content	= html
		self.pageRank   = 1
		t	 			= Tokenizer(self.content) 
		self.tokens 	= t.getTokens()
		
		for ol in outgoing:
			self.addOut(ol)
Example #12
	def __init__(self, name, content, count):
		self.name = name
		self.contentRaw = content
		self.tokens = Tokenizer(content)
		self.condProb = self.tokens.getTokens()
		self.count = count
		self.prior = 0.0
Example #13
 def __init__(self, model_dir):
     '''
     @param model_dir: The directory containing all trained model files
     '''
     self.models = {}
     self.tokenizer = Tokenizer()
     os.path.walk(model_dir, install_all_model, self.models)
     print "All models loaded"
Example #14
    def test_full_tokenize(self):
        format_1_string = FULL_EXAMPLE
        tokenizer = Tokenizer()
        result = tokenizer.tokenize(format_1_string)
        self.assertEqual(len(result), 38)
        self.assertEqual(result[0].token_type, TokenType.StreamAndFunction)
        self.assertEqual(result[0].token_value, [2, 3])
        self.assertEqual(result[1].token_type, TokenType.Header)
        self.assertEqual(result[1].token_value, ["accept reply", "true"])
        self.assertEqual(result[2].token_type, TokenType.ArrayDef)
        self.assertEqual(result[2].token_value, 3)
        self.assertEqual(result[3].token_type, TokenType.Value)
        res_3_value_token = ValueToken(result[3])
        res_3_string_value = """This is some text that
gets to the next line
and the next line and possibly any number of lines."""
        self.assertEqual(res_3_value_token.get_value(), res_3_string_value.replace('\n', '\\n').replace('\t', '\\t'))
Example #15
	def __init__(self, phrase, index):
		self.tokens = Tokenizer( phrase )
		self.index = index
		self.ranking = {}
		self.lengths = {}
		self.tlength = 0
		self.calc_document_length()
		self.calc_query_length()
		self.calc_ranking()
class ClassBank():
	
	def __init__(self):
		self.classes = {}
		self.documentCount = 0
		self.tokenizer = Tokenizer("");

	def addClass(self, classInst):
		self.classes[classInst.getName()] = classInst
		self.tokenizer.tokenize(classInst.contentRaw)
		self.documentCount = self.documentCount + classInst.count

	def getClass(self, name):
		if name in self.classes:
			return self.classes[ name ]
		return False

	def getClasses(self):
		return self.classes

	def getVocabulary(self):
		return self.tokenizer

	def getVocabularySum(self):
		return len(self.tokenizer.getTokens())

	def train(self):
		v = self.getVocabulary().getTokens()
		n = self.documentCount
		for c in self.classes:
			c = self.classes[c]
			c.setPrior(float(c.count) / n)  # float() avoids integer division in Python 2
			t = c.getTokens().getTokens()
			tCount = 0
			for tKey, tValue in t.iteritems():
				tCount = tCount + (tValue + 1)
			for key, value in v.iteritems():
				vCount = 0
				if key in t:
					vCount = t[key]
				c.condProb[key] = float(vCount + 1) / (tCount + len(v))
Example #17
	def __init__(self, text):
		self.candidate_features = []
		self.feature_sentences = []
		t = Tokenizer()
		sents = t.sent_tokenize(text)
		p = POSTagger()
		for sent in sents:
			tagged_sent = p.nltk_tag(t.nltk_tokenize(sent))
			feature_sent = {}
			feature_sent['sentence'] = sent
			feature_sent['nouns'] = []
			feature_sent['noun_phrases'] = []
			for i in range(0, len(tagged_sent)):
				(word, tag) = tagged_sent[i]
				if tag.startswith('N') and tag != 'NNP':
					if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1]:
						feature_sent['noun_phrases'].append(feature_sent['nouns'].pop() + ' ' + word)
					else:
						feature_sent['nouns'].append(word)
					
			self.feature_sentences.append(feature_sent)
Example #18
def main(args):
    # Checks to make sure the right number of params are given.
    # Params: filename
    if len(args)!=2:
        sys.exit("Usage: "+args[0]+" <filename>")
    inFile = args[1]

    tokenizer = Tokenizer(inFile)
    tokenizer.tokenize()
    tokenizer.toXML(inFile)
    parser = Parser(tokenizer.tokens(),VMWriter(inFile))
    parser.parse()
Example #19
class Controler():
    '''Implement Server's APIs' logic.
    '''

    def __init__(self, model_dir):
        '''
        @param model_dir: The directory containing all trained model files
        '''
        self.models = {}
        self.tokenizer = Tokenizer()
        os.path.walk(model_dir, install_all_model, self.models)
        print "All models loaded"

    def list_model(self):
        '''Logic for Server's list_model API
        '''
        print self.models
        return {name: model.labels for name, model in self.models.items()}

    def predict(self, model_name, sentence):
        '''Logic for Server's predict API
        @param model_name: the model to be used to predict the emotion
        @param sentence: the target sentence
        @return: a list of scores, corresponding to the emotion of the selected model
        '''
        try:
            if model_name == 'YAHOO_svm':
                tokenized_us = self.tokenizer.tokenizeStr_without_timeout(sentence.encode('utf8'))[0][0]
                cleanr = re.compile('\([A-Za-z].*?\)')
                tokenized_us = re.sub(cleanr, '', tokenized_us.decode('utf8')).encode('utf8')
                sentence = ' '.join(tokenized_us.split(' '))
            else:
                sentence = str(TextBlob(sentence).translate(to='en'))
        except Exception as e:
            print(e)
            pass

        pred = self.models[model_name].predict(sentence)
        return {'res': pred}
Example #20
    def __init__(self, file, fsm):
        self.steps_for_robot = {}

        # The FSM that was used to generate this experiment.
        self.fsm = fsm
        # A dictionary of dictionaries that keeps, for each robot, how many times each state was visited.
        self.state_counter_for_robot = {}
        # The intermediate rewards
        self.rewards = []
        # Number of robots used during this experiment
        self.number_of_robots = 0

        # Skip the first lines
        while True:
            line = peek_line(file)
            if line.startswith("[INFO]"):
                file.readline()
                continue
            else:
                break

        robot_index = -1
        state_counter_for_robot = {}

        while not peek_line(file).startswith("[INFO]"):
            line = file.readline()

            if line.startswith("Obj"):
                match = re.match(r'Obj --t (\d+) --o (\d+)', line)
                self.rewards.append(int(match.group(2)))
            elif line.startswith("--t"):
                builder = ExperimentStepBuilder()
                tokens = Tokenizer(line)
                while tokens.has_more_tokens():
                    token = tokens.next_token()

                    if token == "--t":
                        index = int(tokens.next_token())
                        builder.set_step_index(index)

                        if (index == 0):
                            state_counter_for_robot = {}
                            robot_index += 1

                        builder.set_robot_index(robot_index)
                        # set also the reward
                        builder.set_reward(self.rewards[index])
                    elif token.startswith("--s"):
                        match = re.match(r'--s(\d+)', token)
                        state_index = int(match.group(1))
                        builder.set_state_index(state_index)

                        increment_state_counter(self.state_counter_for_robot,
                                                robot_index, state_index,
                                                len(self.fsm.states))

                        # Skip the identifier
                        tokens.next_token()
                    elif token == "--n":
                        number_of_neighbours = int(tokens.next_token())
                        builder.set_number_of_neighbours(number_of_neighbours)
                    elif token == "--f":
                        ground_sensor_reading = float(tokens.next_token())
                        builder.set_ground_sensor_reading(
                            ground_sensor_reading)
                    elif token.startswith("--c"):
                        match = re.match(r'--c(\d+)', token)
                        condition = int(match.group(1))
                        builder.set_condition(condition)

                        transition_type = int(
                            tokens.next_token())  # type of the transition
                        builder.set_transition_type(transition_type)

                        value = int(tokens.next_token()
                                    )  # value (with the new code is always 1)
                        builder.set_value(value)

                        probability = float(tokens.next_token(
                        ))  # probability that the transition was active
                        builder.set_probability(probability)
                    elif token == "--a":
                        active_transitions = int(tokens.next_token())
                        builder.set_active_transition(active_transitions)

                step = builder.build()

                add(self.steps_for_robot, robot_index, step)
                self.number_of_robots = robot_index + 1

            elif line.startswith("Score"):
                match = re.match(r'Score (\d+)', line)
                self.result = int(match.group(1))
            elif len(line) == 0:
                break
            else:
                line = file.readline()
                print("Unknown line", line)
Example #21
def tokenizer(file):
    from Tokenizer import Tokenizer
    return Tokenizer(file)
Example #22
 def __init__(self, kernel_type):
     self.kernel = kernel_type
     self.tokenizer = Tokenizer()
     self.__init_classifier(kernel_type)
     pass
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from sklearn.model_selection import train_test_split
from Tokenizer import Tokenizer

data = pd.read_json('raw_data/data.json', lines=True)
raw_text, raw_labels = data['headline'].values[0:10000], data[
    'is_sarcastic'].values[0:10000]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_text)
tokenized_headlines = tokenizer.transform(raw_text)
max_length = max([len(x) for x in tokenized_headlines])
padded_headlines = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_headlines, maxlen=max_length, padding='post')
with open('glove_embedding/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

onehot_labels = tf.keras.utils.to_categorical(raw_labels, num_classes=2)

train_features, test_features, train_labels, test_labels = train_test_split(
    np.array(padded_headlines), np.array(onehot_labels), test_size=0.4)

np.save('processed_data/x.npy', train_features)
np.save('processed_data/y.npy', train_labels)
np.save('processed_data/test_x.npy', test_features)
np.save('processed_data/test_y.npy', test_labels)
Example #24
'''
Created on Sep 11, 2011

@author: calvin
'''

if __name__ == '__main__':
    
    from Tokenizer import Tokenizer
    
    t = Tokenizer()
    
    types = ("ERROR", "WHITESPACE", "CONDITION", "KEYWORD",
             "IDENTIFIER", "COMMENT")
    
    f = open("../testinputs/program.bl")
    
    token = t.get_next_non_whitespace(f)
    
    while token[0] != '':
        print repr(token[0]) + " : " + types[token[1]]
        token = t.get_next_non_whitespace(f)
        
Example #25
class MetaDataSentimentAnalisis:
    def __init__(self, kernel_type):
        self.kernel = kernel_type
        self.tokenizer = Tokenizer()
        self.__init_classifier(kernel_type)
        pass

    def __init_classifier(self, kernel_type):
        if kernel_type == 'rbf':
            self.classifier = svm.SVC(C=1, gamma=0.0000001)
        elif kernel_type == 'linear':
            self.classifier = svm.SVC(kernel='linear')
        elif kernel_type == 'liblinear':
            self.classifier = svm.LinearSVC()
        else:
            self.classifier = svm.SVC()
        self.vectorizer = TfidfVectorizer(min_df=5,
                                          max_df=0.8,
                                          sublinear_tf=True,
                                          use_idf=True)

    def train_text(self,
                   train_data_path,
                   metadata_path,
                   train=True,
                   parameters={
                       'kernel': ('linear', 'rbf'),
                       'C': [1, 10]
                   },
                   store=False,
                   storepath=""):
        parser = TweetParser()
        tweets = parser.parse(train_data_path, metadata_path)
        train_data = []
        train_labels = []
        clean_train = ""
        polarity = "NONE"
        if store and not train:
            print("store needs train ... train=True")
            train = True

        for tweet in tweets:
            clean_train = self.tokenizer.cleanText(tweet.content)
            train_data.append(clean_train)
            polarity = self.checkPolarity(tweet.polarity)
            train_labels.append(polarity)
            #print clean_train
            #print polarity
        # Create feature vectors
        train_vectors = self.vectorizer.fit_transform(train_data)
        if train:
            Cs = [0.001, 0.01, 0.1, 1, 10]
            gammas = [0.001, 0.01, 0.1, 1]
            param_grid = {'C': Cs, 'gamma': gammas}
            self.classifier = grid_search.GridSearchCV(self.classifier,
                                                       param_grid,
                                                       cv=3,
                                                       n_jobs=4,
                                                       verbose=1)
            self.classifier.fit(train_vectors, train_labels)
        if store:
            joblib.dump(self.classifier, storepath)
        return train_labels

    def svc_param_selection(self, X, y, nfolds):
        Cs = [0.001, 0.01, 0.1, 1, 10]
        gammas = [0.001, 0.01, 0.1, 1]
        param_grid = {'C': Cs, 'gamma': gammas}
        grid_search = GridSearchCV(svm.SVC(kernel='rbf'),
                                   param_grid,
                                   cv=nfolds,
                                   n_jobs=4,
                                   verbose=1)
        grid_search.fit(X, y)
        grid_search.best_params_
        return grid_search.best_params_

    def train_features(self,
                       train_data_path,
                       metadata_path,
                       features=[],
                       train=True,
                       store=True,
                       storepath=""):
        (X,
         train_labels) = self.buildFeaturesFromCorpus(train_data_path,
                                                      metadata_path, features)
        if train:
            Cs = [0.001, 0.01, 0.1, 1, 10]
            gammas = [0.001, 0.01, 0.1, 1]
            param_grid = {'C': Cs, 'gamma': gammas}
            self.classifier = grid_search.GridSearchCV(self.classifier,
                                                       param_grid,
                                                       cv=3,
                                                       n_jobs=4,
                                                       verbose=1)
            self.classifier.fit(X, train_labels)
        if store:
            joblib.dump(self.classifier, storepath)
        return train_labels

    def buildFeaturesFromCorpus(self,
                                train_data_path,
                                metadata_path,
                                features=[],
                                test=False):
        parser = TweetParser()
        tweets = parser.parse(train_data_path, metadata_path)
        train_data = []
        train_labels = []
        featuresDict = {}
        for tweet in tweets:
            if not tweet.jsonData:
                continue
            for feature in features:
                if feature not in featuresDict:
                    featuresDict[feature] = tweet_feature(
                        feature)  # create the feature
                if feature == "geo" or feature == "place" or feature == "favorited":
                    if feature in tweet.jsonData:
                        pass
                        #print str(type(tweet.jsonData[feature]))
                        #print tweet.jsonData[feature]
                if feature in tweet.jsonData:
                    dataValue = tweet.jsonData[feature]
                    featuresDict[feature].data.append(dataValue)
                    featuresDict[feature].count += 1
                    featuresDict[feature].totalCount += 1
                else:
                    if featuresDict[feature].type == bool:
                        featuresDict[feature].data.append(False)
                    elif featuresDict[feature].type == int:
                        featuresDict[feature].data.append(0)
                    else:
                        featuresDict[feature].data.append("null")
                    featuresDict[feature].totalCount += 1

            polarity = self.checkPolarity(tweet.polarity)
            train_labels.append(polarity)
        for feature in features:
            featuresDict[feature].print_stats()
            if feature == "text":
                Xt = self.buildFeatureDataMatrix(featuresDict[feature], test)
                train_data.append(Xt)
            else:
                train_data.append(
                    self.buildFeatureDataMatrix(featuresDict[feature], test))
        if train_data:
            #Xm = scipy.sparse.csc_matrix(train_data)
            #Xm = Xm.transpose(True)
            # X : sparse matrix, [n_samples, n_features]
            #    Tf-idf-weighted document-term matrix.
            #print Xt.shape
            #print Xm.shape
            X = scipy.sparse.hstack(train_data)
        else:
            X = Xt
        return (X, train_labels)

    def test(self, test_data_path="", metadata_path="", load=False, model=""):
        parser = TweetParser()
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = []
        for tweet in tweets:
            test_data.append(self.tokenizer.cleanText(tweet.content))

        test_vectors = self.vectorizer.transform(test_data)
        if load and model:
            self.classifier = joblib.load(model)
        predictions = self.classifier.predict(test_vectors)
        return predictions

    def test_features(self,
                      test_data_path="",
                      metadata_path="",
                      features=[],
                      load=False,
                      model=""):
        (X,
         train_labels) = self.buildFeaturesFromCorpus(test_data_path,
                                                      metadata_path, features,
                                                      True)
        if load and model:
            self.classifier = joblib.load(model)
        predictions = self.classifier.predict(X)
        return (predictions, train_labels)

    def predict(self, test_data_path="", metadata_path="", model=""):
        parser = TweetParser()
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = []
        for tweet in tweets:
            test_data.append(self.tokenizer.cleanText(tweet.content))

        test_vectors = self.vectorizer.transform(test_data)
        self.classifier = joblib.load(model)
        predictions = self.classifier.predict(test_vectors)
        for text, prediction in zip(test_data, predictions):
            print text
            print prediction

    def checkPolarity(self, polarity_elements):
        polarity = 'NONE'
        if polarity_elements:
            for polarity_element in polarity_elements:
                polarity = polarity_element
                if not polarity == 'NONE':
                    break
        return polarity

    def buildFeatureDataMatrix(self, feature, test=False):
        featureData = []
        for data in feature.data:
            if feature.name == "text":
                featureData.append(self.tokenizer.cleanText(data))
            else:
                featureData.append(data)
        if feature.name == "text":
            if not test:
                text_features = self.vectorizer.fit_transform(featureData)
            else:
                text_features = self.vectorizer.transform(featureData)
            return text_features
        else:
            return scipy.sparse.csc_matrix(featureData).transpose(True)
Example #26
import sys
import re
import os
import json
from collections import OrderedDict
from Tokenizer import Tokenizer


dir1 = sys.argv[1] # dir for the top-level category classifier: prior, condprobs & config
dir2 = sys.argv[2] # dir for the subcat classifier; contains a subdir for every top-level category, each with prior & condprobs
infile = open(sys.argv[3],'r') # input file, tab-separated
opformat = sys.argv[4] #json or tsv
assert opformat == 'json' or opformat == 'tsv'

prior = json.load(open(os.path.join(dir1,'prior.json'),'rb'))
condprobs = json.load(open(os.path.join(dir1,'probs.json'),'rb'))
NB = NaiveBayes(prior, condprobs)
t = Tokenizer()

subcat_classifiers = {}
for k in prior.keys():
    p = json.load(open(os.path.join(dir2,re.sub('[ &]','_', k),'prior.json'),'rb'))
    c = json.load(open(os.path.join(dir2,re.sub('[ &]','_', k),'probs.json'),'rb'))
    subcat_classifiers[k] = NaiveBayes(p,c)

def unicodify(text):
    return text.encode('utf-8','ignore')

def print_line(d):
    if opformat == 'tsv':
        print "\t".join(d.values()).encode('utf-8','ignore')
    if opformat == 'json':
        print json.dumps(d)
Example #27
    "h4": 20,
    "h5": 20,
    "h6": 20,
    "h7": 20,
    "strong": 5,
    "b": 3,
    "i": 3,
    "u": 3
}

src = "D:/UCI/CS221_IR/HW3/IR_projecct3/WEBPAGES_RAW"

folder = os.listdir(src)
buffer = Buffer()
totalDocs = 0
tokenizer = Tokenizer()
blockedList = ['script']
anchorpath = "anchor.txt"
anchortext = LoadanchorText(anchorpath)
for directory in folder:
    currDir = src + "/" + directory
    if not os.path.isdir(currDir):
        continue
    files = os.listdir(currDir)
    lines = list()
    for file in files:
        # print file
        currFile = currDir + "/" + file
        #skip for directory
        if not os.path.isfile(currFile):
            continue
Example #28
print("Enter/Paste the content from the webpage(select all with ctrl or cmd + a and copy paste in terminal). \n Ctrl-D or Ctrl-Z ( windows ) to run program it.")
contents = []
while True:
  try:
      line = input()
  except EOFError:
      break
  contents.append(line)





inputArray = contents
tokenizer = Tokenizer(inputArray)

print("Results")
print(tokenizer.getTestResult())
print("Input")
print(tokenizer.getTestInput())
print("iterationlimit")
print(tokenizer.getIterlimit())
print("iteratoramount")
print(tokenizer.getIterator())
print("fixed modifier")
print(tokenizer.getFixed())


calculator = Calculator(tokenizer)
calculator.calculate()
Example #29
    cnn: ".cnn",
    atlantic: ".atl",
    vice: ".vc",
    blaze: ".blz",
    federalist: ".fd",
    wsj: ".wsj",
    nyt: ".nyt",
    #huffpost: ".huf"
}

papers = [foxnews, breitbart, thinkprogress, cnn, atlantic, vice, blaze, federalist, wsj, nyt]#, huffpost]
news_pool.set(papers, threads_per_source=5)
news_pool.join()

translator = str.maketrans('', '', string.punctuation)
tk = Tokenizer()
for source in papers:
    name = source.domain.split(".")
    if name[0] == "www":
        name = name[1]
    else:
        name = name[0]
    print("beginning {} crawl".format(name))
    directory = '../articles/'+name
    if not os.path.exists(directory):
        os.makedirs(directory)
    for article in source.articles:
        article.parse()
        title = article.title.translate(translator)
        title = title.lower().replace(" ", "_")
        title += extensions[source]
Example #30
    stopwords=mapping["stopwords"]
    document=mapping["document"]
    miniprobability=mapping["miniprobability"]
    minitogether=mapping["minitogether"]
    set_word=mapping["set_word"]
    dict_frq_word=mapping["dict_frq_word"]

word_freq_path = './data/word_freq.txt'
# char set file
common_char_path = './data/common_char_set.txt'
# same pinyin char file
same_pinyin_path = './data/same_pinyin.txt'
# custom confusion set
custom_confusion_path = './data/custom_confusion.txt'
# custom word for segment
custom_word_path = './data/custom_word.txt'
# custom pinyin word list
custom_pinyin_word_path="./data/custom_pinyin_word.txt"
tokenizer = Tokenizer(word_freq_path=word_freq_path,
                          common_char_path=common_char_path,
                          same_pinyin_path=same_pinyin_path,
                          custom_confusion_path=custom_confusion_path,
                          custom_word_path=custom_word_path)

pm = PMI_test(tokenizer=tokenizer, stopwords=stopwords, document=document, miniprobability=miniprobability,
              minitogether=minitogether, set_word=set_word, dict_frq_word=dict_frq_word)


print("documents read done")
print('pm.calculate_lis("小提琴","朗姆酒")', pm.calculate_lis("小提琴", "朗姆酒"))
print('pm.calculate_lis("小提琴","大钢琴")', pm.calculate_lis("小提琴", "大钢琴"))
Example #31
 def run(code):
     Parser.tokens = Tokenizer(code)
     Parser.tokens.selectNext()
     result = Parser.parseProgram()
     # print(result)  # Prints the AST
     return result
 train['target_str'] = reduce(lambda x,y: x+y, [train[col].astype(str) for col in list_classes])
 train['target_str'] = train['target_str'].replace('110101', '000000').replace('110110','000000')
 cvlist1 = list(StratifiedKFold(n_splits=10, random_state=786).split(train, train['target_str'].astype('category')))
 cvlist2 = list(StratifiedShuffleSplit(n_splits=5, test_size=0.05, random_state=786).split(train, train['target_str'].astype('category')))
 
 # Normalize text
 for df in train, test:
     df["comment_text"] = normalizeString(df["comment_text"])
 #stemmer = PorterStemmer()
 #def custom_tokenize(text):
 #    tokens = wordpunct_tokenize(text)
 #    tokens = [stemmer.stem(token) for token in tokens]
 #    return tokens
     
 # Tokenize comments
 tok = Tokenizer(max_features=MAX_FEATURES, max_len=MAX_LEN, tokenizer=wordpunct_tokenize)
 X = tok.fit_transform(pd.concat([train["comment_text"].astype(str).fillna("na"), test["comment_text"].astype(str).fillna("na")]))
 X_train = X[:len(train), :]
 X_test = X[len(train):, :]
 
 print(X_train.shape, X_test.shape)
 print("<+++++++>")
 print("Total words found by tokenizer in train and test are {}".format(len(tok.doc_freq)))
 print("Top 10 words in vocab are {}".format(tok.doc_freq.most_common(10)))
 print("Last 10 words to be used vocab with their freq are {}".format(tok.doc_freq.most_common(MAX_FEATURES)[-10:]))
 
 #Initialize embeddings
 embedding_matrix, oov_list = initialize_embeddings(EMBEDDING_FILE, tok)
 print("<+++++++>")
 print("Size of initialized matrix is {}".format(embedding_matrix.shape))
 print("No. of words in that were not found in embedding are ".format(len(oov_list)))
Example #33
#!/usr/bin/env python
import sys
import json
from Tokenizer import Tokenizer

t = Tokenizer()
f = open('config.json', 'r')
config = json.load(f)
f.close()
catidxmap = config['catidx']
wt = int(config['wt'])
classidx = int(config['classidx'])

def get_weight(event):
    if event == '1':
        return wt
    elif event == '12':
        return 1
    else:
        return 1

for line in sys.stdin:
    try:
        line = line.decode('utf-8','ignore').strip()
        toks = line.split('\001')
        query_toks = t.tokenize(toks[0].strip())
        event = toks[1].strip()
        orig_page_toks = t.prepend('ORIG_PAGE', toks[6].strip(), 1)
        first_hit_page_toks = t.prepend('FIRST_HIT', toks[7].strip(), 1)
        facet_value_toks = t.prepend('FACET_VALUE', toks[8].strip(), 5)
        facet_atr_toks = t.prepend('FACET_ATRS', toks[9].strip(), 5)
Example #34
'''
Created on Sep 21, 2011

@author: calvin
'''

if __name__ == '__main__':
    from StatementParse import StatementParse
    from Tokenizer import Tokenizer
    
    t = Tokenizer()
    stmt = StatementParse()
    f = open("../testinputs/statement.bl")
    
    ttext, ttype = t.get_next_non_whitespace(f)
    
    stmt.parse(f, t, ttext, ttype)
    
    print "\n\n" + str(stmt)
Example #35
 def __init__(self):
     self.activeKeywords = dict()
     self.tokenizer = Tokenizer()
Example #36
class Scorer():
	
	def __init__(self, phrase, index):
		self.tokens = Tokenizer( phrase )
		self.index = index
		self.ranking = {}
		self.lengths = {}
		self.tlength = 0
		self.calc_document_length()
		self.calc_query_length()
		self.calc_ranking()

	def calc_document_length(self):
		for i in self.index.index:
			urls = self.index.index[ i ].urlList
			for d in urls.iterkeys():
				if d not in self.lengths:
					self.lengths[ d ] = 0
				self.lengths[ d ] += math.pow( self.calc_tf( urls[ d ] ) * self.calc_dtf( len( self.index.bank.urls ), i ), 2 )
		for d in self.lengths:
			self.lengths[ d ] = math.sqrt( self.lengths[ d ] )

	def calc_query_length(self):

		for t in self.tokens.getTokens():
			self.tlength += math.pow( self.calc_tf( self.get_query_term_length( t ) ) * self.calc_dtf( len( self.index.bank.urls ), t ), 2 )
		self.tlength = math.sqrt( self.tlength )

	def calc_ranking(self):
		for t in self.tokens.getTokens():
			
			it = self.index.getIndexToken( t )
			dtf = self.calc_dtf( len( self.index.bank.urls ), t )

			for d in it.urlList.iterkeys():	
				tf = self.calc_tf( it.urlList[ d ] )
				wtq = tf * dtf

				wtf = self.calc_tf( self.get_query_term_length( t ) )
				wtd = wtf * dtf
				
				if d not in self.ranking:
					self.ranking[ d ] = 0

				self.ranking[ d ] += ( wtq * wtd )

		for d in self.ranking:
			self.ranking[ d ] = self.ranking[ d ] / ( self.lengths[ d ] * self.tlength )

	def calc_tf(self, val):
		return ( 1 + math.log10( val ) )

	def calc_dtf(self, n, token):
		return math.log10( float( n ) / float( self.index.getDocumentFrequency( token ) ) )

	def get_query_term_length(self, token):
		count = 0
		for t in self.tokens.getTokens():
			if t == token:
				count = count + 1
		return count

	def printScoring(self):
		printable = "[";
		for t in self.tokens.getTokens():
			printable += "'%s', " % ( t )
		printable = printable[:-2] + "]\n"
		for item in sorted( self.ranking.items(), key=lambda x: x[1], reverse=True ):
			printable += "%s:\t%.6f\n" % (item[0], item[1] )
		print (printable)

	def printDocumentLength(self):
		printable = "";
		for item in sorted( self.lengths ):
			printable += "%s:\t%.6f\n" % ( item, self.lengths[ item ] )
		print (printable)
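For reference, the weighting scheme behind Scorer is standard tf-idf cosine scoring: calc_tf computes 1 + log10(tf), calc_dtf computes log10(N / df), each document accumulates the product of its document-side and query-side weights, and calc_document_length / calc_query_length supply the vector lengths used for normalisation. A small stand-alone illustration of that arithmetic, with made-up numbers rather than values from the index, is:

# Toy illustration of the tf-idf weighting used in Scorer (assumed numbers).
import math

N, df, tf_doc, tf_query = 1000, 50, 3, 1
idf = math.log10(float(N) / df)              # calc_dtf
w_doc = (1 + math.log10(tf_doc)) * idf       # document-side weight (wtq in calc_ranking)
w_query = (1 + math.log10(tf_query)) * idf   # query-side weight (wtd in calc_ranking)
# one term's contribution to ranking[d] before dividing by the two vector lengths
print(w_doc * w_query)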
Example #37
class Interpreter(object):
    def __init__(self, text):
        self.tokens = Tokenizer(text).tokenize()
        self.expr = []

    def next_expression(self):
        expr = [self.tokens.pop(0)]
        if expr[0] == '{':
            c = self.tokens.pop(0)
            self.tokens.insert(0, '{'+c)
            state['{'+c] = State.brace(int(c))
            return self.next_expression()
        elif type(state[expr[0]]) is Operator:
            for _ in range(state[expr[0]].arity):
                expr += self.next_expression()
        return expr

    def evaluate(self, new_expr=True):
        if new_expr:
            self.expr = self.next_expression()
        if len(self.expr) == 0:
            return None
        if re.fullmatch(r'while', self.expr[0]):
            return self.process_while()
        if re.fullmatch(r'for', self.expr[0]):
            return self.process_for()
        elif re.fullmatch(r'[0-9]+', self.expr[0]):
            return int(self.expr[0])
        elif type(state[self.expr[0]]) is Operator:
            return self.eval_token(self.expr.pop(0))
        return self.expr[0]

    def eval_token(self, token):
        if re.fullmatch(r'[0-9]+', token):
            return int(token)
        elif type(state[token]) is Operator:
            return state[token](*[self.eval_token(self.expr.pop(0)) for _ in range(state[token].arity)])
        return token

    def has_token(self):
        return len(self.tokens) > 0

    def process_while(self):
        a = self.next_expression()
        b = self.next_expression()
        while True:
            self.expr = a[:]
            result = self.evaluate(new_expr=False)
            if not result:
                return None
            self.expr = b[:]
            self.evaluate(new_expr=False)

    def process_for(self):
        preset = self.next_expression()
        boolean = self.next_expression()
        increment = self.next_expression()
        body = self.next_expression()
        self.expr = preset[:]
        self.evaluate(new_expr=False)
        while True:
            self.expr = boolean[:]
            result = self.evaluate(new_expr=False)
            if not result:
                return None
            self.expr = body[:]
            self.evaluate(new_expr=False)
            self.expr = increment[:]
            self.evaluate(new_expr=False)
Example #38
 def segmentRawSentences(self, tokenizer: Tokenizer, strs: str):
     sentence = tokenizer.joinSentences(tokenizer.tokenize(strs))
     return self.segmentTokenizedString(sentence)
Example #39
    def create_index(rootDir):
        MB_100 = 100000000
        docID = 0
        partials = []

        print('Starting...')

        # Set this to the path where you downloaded the developer JSON files
        rootDir = Path(rootDir)

        # partial index
        index = defaultdict(dict)
        partial_num = 0
        curr_size = getsizeof(index)
        # Traverse the directory tree starting at rootDir
        for dirName, subdirList, fileList in os.walk(rootDir):
            # Grab JSON files
            for fname in fileList:
                # Get the path to the JSON file:
                # e.g. C:\Users\bchau\Desktop\Projects\developer.zip\DEV\aiclub_ics_uci_edu
                path = Path(dirName).joinpath(fname)

                # Open the JSON file with above path
                with open(path) as f:
                    # Load JSON file
                    document = json.load(f)

                    stems = Tokenizer.tokenize_and_stem(document['content'])

                    # if document is low info, output url to text file and skip indexing
                    if len(stems) == 0:
                        print('Low informational value document: ' +
                              document['url'] + '\n')
                    else:
                        # stem tokens and get word frequency from the document
                        word_freq = Tokenizer.get_word_freq(stems)

                        # get term frequencies
                        token_tf = Indexer.word_freq_to_tokentf(word_freq)

                        # Convert token_tf list to indices
                        entries = Indexer.tokentf_to_postingtf(docID, token_tf)
                        # add to index and increment size counter appropriately
                        for token, postings in entries.items():
                            for docID, posting in postings.items():
                                if token not in index:
                                    curr_size += getsizeof({})
                                index[token][docID] = posting
                                curr_size += getsizeof(docID)
                                curr_size += getsizeof(posting)

                        print('Indexed w/ tf: ', docID)

                        # if index grows larger than 100 MB write it to disk
                        if curr_size > MB_100:
                            # append the name of the file that we wrote to to a list
                            partials.append(
                                Indexer.write_partial_to_disk(
                                    index, partial_num))
                            partial_num += 1
                            index.clear()
                            curr_size = getsizeof(index)

                    docID += 1

        # perform merge on partial indices
        Indexer.merge_partials(partials, get_num_docs(rootDir))

        for filename in partials:
            filePath = Path(filename)
            if filePath.exists():
                os.remove(filePath)

        print('Finished.')
Example #40
        scores = []
        token_set = set(tokens)
        for class_label, prior in self.prior.items():
            cat_score = 0
            cat_score+=math.log(prior)
            for token in token_set:
                if token in self.condprobs:
                    cat_score+=math.log(self.condprobs[token][class_label])
            scores.append((class_label, cat_score))
        sum_exp = sum(math.exp(s[1]) for s in scores)
        norm_probs = [(class_label, math.exp(s)/sum_exp) for class_label,s in scores]
        return sorted(norm_probs, key = itemgetter(1), reverse=True)[:top_n]

#
#testing code
#

if __name__ == '__main__':
    import json
    import sys
    from Tokenizer import Tokenizer
    t = Tokenizer()
    prior = json.load(open(sys.argv[2],'r'))
    condprobs = json.load(open(sys.argv[3],'r'))
    q = sys.argv[1].decode('ascii','ignore')
    NB = NaiveBayes(prior, condprobs)
    tokens = t.tokenize(q)
    print tokens
    print NB.classify(tokens)
    print NB.score(tokens,2)
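The __main__ block above assumes a NaiveBayes class constructed from the prior and condprobs JSON dumps. A hedged stub of the interface used here, with classify() written as an argmax over the same log-space scoring that score() performs, could look like this; it is inferred from the snippet, not taken from the project's actual implementation.

# Hypothetical NaiveBayes stub matching the usage above; the real class may differ.
import math

class NaiveBayes(object):
    def __init__(self, prior, condprobs):
        self.prior = prior          # {class_label: P(c)}
        self.condprobs = condprobs  # {token: {class_label: P(token|c)}}

    def classify(self, tokens):
        # argmax of log P(c) + sum of log P(token|c) over the distinct tokens
        best_label, best_score = None, float('-inf')
        for label, prior in self.prior.items():
            score = math.log(prior)
            for token in set(tokens):
                if token in self.condprobs:
                    score += math.log(self.condprobs[token][label])
            if score > best_score:
                best_label, best_score = label, score
        return best_label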
Example #41
            if fac is None:
                print("Error, identifier: " + identifier +
                      "is not assigned a value")
                sys.exit()
            self.pt.moveToParent()
        elif (alt == 2):
            self.pt.moveToChild(1)
            fac = self.execExp()
            self.pt.moveToParent()
        else:
            print("Error executing Fac, alt not found")
            sys.exit()
        return fac

    # returns the name of an identifiers
    def execId(self):
        self.pt.moveToChild(0)
        identifier = self.pt.getName()
        self.pt.moveToParent()
        return identifier

    # executes and returns a number from the parse tree
    def execInt(self):
        self.pt.moveToChild(0)
        num = int(self.pt.getName())
        self.pt.moveToParent()
        return num


i = Interpreter(ParseTree(Tokenizer(str(sys.argv[1]))))
Example #42
 def __init__(self, text):
     self.tokens = Tokenizer(text).tokenize()
     self.expr = []
Example #43
class MetaDataSentimentAnalisis:
    
    def __init__(self, kernel_type):
        self.kernel = kernel_type
        self.tokenizer = Tokenizer()
        self.__init_classifier(kernel_type)
        pass
            
    def __init_classifier(self, kernel_type):
        if kernel_type == 'rbf':
            self.classifier = svm.SVC(C=1, gamma=0.0000001)
        elif kernel_type == 'linear':
            self.classifier = svm.SVC(kernel='linear')
        elif kernel_type == 'liblinear':
            self.classifier = svm.LinearSVC()
        else:
            self.classifier = svm.SVC()
        self.vectorizer = TfidfVectorizer(min_df=5,
                                     max_df = 0.8,
                                     sublinear_tf=True,
                                     use_idf=True)
                    
        
    def train_text(self, train_data_path, metadata_path, 
              train = True,
              parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]},
              store = False,
              storepath = ""):
        parser = TweetParser() 
        tweets = parser.parse(train_data_path, metadata_path)
        train_data = []
        train_labels = []
        clean_train = ""
        polarity = "NONE"
        if store and not train:
            print ("store needs train ... train=True")
            train = True
            
        for tweet in tweets:
            clean_train = self.tokenizer.cleanText(tweet.content)
            train_data.append(clean_train)
            polarity = self.checkPolarity(tweet.polarity)
            train_labels.append(polarity)
            #print clean_train
            #print polarity
        # Create feature vectors
        train_vectors = self.vectorizer.fit_transform(train_data)
        if train:
            Cs = [0.001, 0.01, 0.1, 1, 10]
            gammas = [0.001, 0.01, 0.1, 1]
            param_grid = {'C': Cs, 'gamma' : gammas}
            self.classifier = grid_search.GridSearchCV(self.classifier, param_grid, cv=3, n_jobs=4, verbose=1)
            self.classifier.fit(train_vectors, train_labels)
        if store:
            joblib.dump(self.classifier, storepath) 
        return train_labels
    
    def svc_param_selection(self,X, y, nfolds):
        Cs = [0.001, 0.01, 0.1, 1, 10]
        gammas = [0.001, 0.01, 0.1, 1]
        param_grid = {'C': Cs, 'gamma' : gammas}
        grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds, n_jobs=4, verbose=1)
        grid_search.fit(X, y)
        grid_search.best_params_
        return grid_search.best_params_

    def train_features(self, train_data_path, metadata_path, features= [],
                        train = True, store = True, storepath = ""):
        (X, train_labels) = self.buildFeaturesFromCorpus(train_data_path, metadata_path, features)
        if train:
            Cs = [0.001, 0.01, 0.1, 1, 10]
            gammas = [0.001, 0.01, 0.1, 1]
            param_grid = {'C': Cs, 'gamma' : gammas}
            self.classifier = grid_search.GridSearchCV(self.classifier, param_grid, cv=3, n_jobs=4, verbose=1)
            self.classifier.fit(X, train_labels)
        if store:
            joblib.dump(self.classifier, storepath)
        return train_labels
             
    def buildFeaturesFromCorpus(self,train_data_path, metadata_path, features= [],
                                 test = False):
        parser = TweetParser() 
        tweets = parser.parse(train_data_path, metadata_path)
        train_data = []
        train_labels = []
        featuresDict ={}
        for tweet in tweets:
            if not tweet.jsonData:
                continue
            for feature in features:
                if feature not in featuresDict:
                    featuresDict[feature] = tweet_feature(feature)  # create the feature
                if feature == "geo" or feature == "place" or feature == "favorited":
                        if feature in tweet.jsonData:
                            pass
                            #print str(type(tweet.jsonData[feature]))
                            #print tweet.jsonData[feature]
                if feature in tweet.jsonData:
                    dataValue = tweet.jsonData[feature]
                    featuresDict[feature].data.append(dataValue)
                    featuresDict[feature].count +=1
                    featuresDict[feature].totalCount += 1
                else:
                    if featuresDict[feature].type == bool: 
                        featuresDict[feature].data.append(False)
                    elif featuresDict[feature].type == int:
                        featuresDict[feature].data.append(0)
                    else:
                        featuresDict[feature].data.append("null")
                    featuresDict[feature].totalCount += 1
                    
            polarity = self.checkPolarity(tweet.polarity)
            train_labels.append(polarity)   
        for feature in features:
            featuresDict[feature].print_stats()
            if feature == "text":
                Xt = self.buildFeatureDataMatrix(featuresDict[feature], test)
                train_data.append(Xt)
            else:     
                train_data.append(self.buildFeatureDataMatrix(featuresDict[feature], test))
        if train_data:                  
            #Xm = scipy.sparse.csc_matrix(train_data)
            #Xm = Xm.transpose(True)     
            # X : sparse matrix, [n_samples, n_features]
            #    Tf-idf-weighted document-term matrix.    
            #print Xt.shape
            #print Xm.shape
            X = scipy.sparse.hstack(train_data)
        else:
            X = Xt    
        return (X,train_labels)
                
    def test(self, test_data_path = "", metadata_path = "", load = False, model = ""):
        parser = TweetParser() 
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = []
        for tweet in tweets:
            test_data.append(self.tokenizer.cleanText(tweet.content))
            
        test_vectors = self.vectorizer.transform(test_data)
        if load and model:
            self.classifier = joblib.load(model) 
        predictions = self.classifier.predict(test_vectors)
        return predictions
    
    def test_features(self, test_data_path = "", metadata_path = "", features= [], load = False, model = ""):
        (X, train_labels) = self.buildFeaturesFromCorpus(test_data_path, metadata_path, features, True)  
        if load and model:
            self.classifier = joblib.load(model) 
        predictions = self.classifier.predict(X)
        return (predictions, train_labels)
    
    def predict(self, test_data_path = "", metadata_path = "", model = ""):  
        parser = TweetParser() 
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = []
        for tweet in tweets:
            test_data.append(self.tokenizer.cleanText(tweet.content))  
            
        test_vectors = self.vectorizer.transform(test_data)
        self.classifier = joblib.load(model) 
        predictions = self.classifier.predict(test_vectors)   
        for text, prediction in zip(test_data, predictions):
            print text
            print prediction
    
    def checkPolarity(self, polarity_elements):
        polarity = 'NONE'
        if polarity_elements:
            for polarity_element in polarity_elements:
                polarity = polarity_element
                if not polarity == 'NONE':
                    break 
        return polarity
    
    def buildFeatureDataMatrix(self, feature, test = False):
        featureData = []
        for data in feature.data:
            if feature.name == "text":
                featureData.append(self.tokenizer.cleanText(data))
            else:    
                featureData.append(data)
        if feature.name == "text":
            if not test:
                text_features = self.vectorizer.fit_transform(featureData)
            else:
                text_features=self.vectorizer.transform(featureData)       
            return text_features
        else:
            return scipy.sparse.csc_matrix(featureData).transpose(True)
Example #44
class Parser:
    def __init__(self, input_content, file=True, debug=False):
        self.tokenizer = Tokenizer(input_content, file)
        self.tokenizer.tokenize()
        self.token = None
        self.is_debug = debug
        self.pos = 0
        self.info = []
        self.debug = []
        self.eps = False

    def take_token(self, token_type_list, eps=False):
        self.eps = False
        self.token = self.next_token()
        if self.is_debug:
            print(self.token)
        if self.token.type not in token_type_list:
            if eps:
                self.token = self.prev_token()
                self.eps = True
                if self.is_debug:
                    print(self.eps)
                return 'EPS'
            else:
                self.raise_error(
                    "Unexpected token: %s at '%s' on line %d column %d; Expected one of %s"
                    % (self.token.type, self.token.value, self.token.line,
                       self.token.column, str(token_type_list)))
        return self.token

    def take_tokens(self, token_type_list):
        for token_type in token_type_list:
            self.take_token(token_type)

    def prev_token(self):
        self.pos -= 1
        return self.tokenizer.tokens[self.pos - 1]

    def next_token(self):
        self.pos += 1
        return self.tokenizer.tokens[self.pos - 1]

    @staticmethod
    def raise_error(msg='Unexpected error occurred'):
        raise RuntimeError('Parser error; %s' % msg)

    def log_info(self, stmt):
        self.info.append(stmt + ' - OK')
        if self.is_debug:
            print(self.info[-1])

    def log_debug(self, stmt):
        self.debug.append(stmt)
        if self.is_debug:
            print(stmt)

    def reset(self):
        self.pos = 0
        self.info = []
        self.debug = []
        self.eps = False

    def parse(self):
        self.reset()
        self.start()
        return

    def start(self):
        # self.next_token()
        self.x_elems()
        return

    # noinspection SpellCheckingInspection
    def x_elems(self):
        self.log_debug('x_elems')
        self.take_token(['ANCHOR_OP', 'VAR_NAME', 'EOF'])
        if self.token.type == 'EOF':
            return
        else:
            self.x_elem()
        self.x_elems()

    def x_elem(self):
        self.log_debug('x_elem')
        if self.token.type == 'ANCHOR_OP':
            self.x_style()
        elif self.token.type == 'VAR_NAME':
            self.x_var_def()
        else:
            self.raise_error()

    def x_style(self):
        self.log_debug('x_style')
        self.x_items()
        self.log_info('style')

    def x_items(self):
        self.log_debug('x_items')
        eps = self.take_token(
            ['VAR_NAME', 'STRING', 'ANCHOR_OP', 'ANCHOR_CLOSE'], eps=True)
        if eps == 'EPS':
            return
        elif self.token.type == 'VAR_NAME':
            self.x_var_def()
        elif self.token.type == 'STRING':
            self.take_token(['COLON'])
            self.x_value()
            self.x_properties()
            self.take_token(['SEMICOLON'])
        elif self.token.type == 'ANCHOR_OP':
            self.x_style()
        elif self.token.type == 'ANCHOR_CLOSE':
            return
        else:
            self.raise_error()
        self.log_info('items')
        self.x_items()

    def x_properties(self):
        self.log_debug('x_properties')
        self.x_value(True)
        if self.eps:
            return
        else:
            self.x_properties()

    def x_var_def(self):
        self.log_debug('x_var_def')
        self.take_token(['COLON'])
        self.x_value()
        self.take_token(['SEMICOLON'])
        self.log_info('var_def')

    def x_value(self, eps_mode=False):
        self.log_debug('x_value')
        eps = self.take_token(
            ['CONSTANT', 'STRING', 'VAR_NAME', 'FUNCTION_OP'], eps_mode)
        if eps_mode and eps == 'EPS':
            pass
        elif self.token.type == 'CONSTANT':
            pass
        elif self.token.type == 'STRING':
            pass
        elif self.token.type == 'FUNCTION_OP':
            self.x_values()
            self.take_token(['BRACKET_CLOSE'])
        elif self.token.type == 'VAR_NAME':
            pass
        else:
            self.raise_error()

    def x_values(self):
        self.log_debug('x_values')
        self.x_value(True)
        eps = self.take_token(['COMMA'], True)
        if eps == 'EPS':
            pass
        else:
            self.x_values()
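A hypothetical driver for the Parser above; the input path and the token vocabulary (ANCHOR_OP, VAR_NAME, ...) depend on the Tokenizer implementation, which is not shown, so this only illustrates the public API:

parser = Parser("styles.qss", file=True, debug=False)  # input path is an assumption
try:
    parser.parse()
    for entry in parser.info:   # e.g. "var_def - OK", "style - OK"
        print(entry)
except RuntimeError as err:     # raise_error wraps parse failures in RuntimeError
    print(err)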
Ejemplo n.º 45
0
class DataPrepper():
    def __init__(self, PATH_TO_STOP_WORDS, PATH_TO_TRAIN_LIST):
        self.PATH_TO_STOP_WORDS = PATH_TO_STOP_WORDS
        self.PATH_TO_CLASS_LIST = PATH_TO_TRAIN_LIST
        self.Tokenizer = Tokenizer(self.PATH_TO_STOP_WORDS)

        # Set up class-specific constants
        # fpc stands for filename_path_classnames
        self.fpc = self.load_paths_to_training_text()
        self.class_names = self.get_class_names()

        print("[DataPrepper] Instantiated!")

    """
  Processes the dataset and returns the feature vectors of each of the training
  and test sets (positively and negatively classified)

  Note:
    train_pos_doc_map = datasets[0][0]
    train_neg_doc_map = datasets[0][1]
    test_pos_doc_map = datasets[1][0]
    test_neg_doc_map = datasets[1][1]
  """

    def run(self, class_name, cross_validation_mode=False):
        print("[DataPrepper] Running for", class_name,
              ", prepping datasets...")

        datasets = None
        if cross_validation_mode:
            datasets = self.prep_dataset(class_name, 1.0, 1.0)
        else:
            datasets = self.prep_dataset(class_name, 0.8, 0.9)

        print(
            "Sample sizes - Train: %d positives + %d negatives, Test: %d positives + %d negatives"
            % (len(datasets[0][0]), len(datasets[0][1]), len(
                datasets[1][0]), len(datasets[1][1])))

        # Text normalization: tokenization, stop word removal & stemming
        print("[DataPrepper] Tokenizing datasets...")
        datasets_df_pair = self.tokenize_datasets(datasets)
        datasets = datasets_df_pair[0]
        doc_freq_map = datasets_df_pair[1]

        # Cull low-document-frequency words from the vocabulary
        print("Num of words in vocab: Vocab=%d" % len(doc_freq_map.keys()))
        doc_freq_map = self.cull_doc_freq(doc_freq_map, 50)
        print("Num of words in vocab: Culled Vocab=%d" %
              len(doc_freq_map.keys()))

        N_docs = len(datasets[0][0]) + len(datasets[0][1]) + len(
            datasets[1][0]) + len(datasets[1][1])
        datasets = self.setup_tfidf_vector(N_docs, datasets, doc_freq_map)

        # === FOR DEBUGGING ===
        # tryA = datasets[0][0][list(datasets[0][0].keys())[0]]
        # tryB = datasets[0][1][list(datasets[0][1].keys())[0]]
        # tryC = datasets[1][0][list(datasets[1][0].keys())[0]]
        # tryD = datasets[1][1][list(datasets[1][1].keys())[0]]
        # print('---SEE WHAT FEATURE VECTORS LOOK LIKE FOR %s---' % class_name)
        # print('try A:', tryA, 'dim:', len(tryA))
        # print('try B:', tryB, 'dim:', len(tryB))
        # print('try C:', tryC, 'dim:', len(tryC))
        # print('try D:', tryD, 'dim:', len(tryD))
        # print('---END SEE WHAT FEATURE VECTORS LOOK LIKE---')

        f_vector_pos_train = self.setup_feature_vectors_for_classifier(
            datasets[0][0])
        f_vector_neg_train = self.setup_feature_vectors_for_classifier(
            datasets[0][1])
        f_vector_pos_test = []
        f_vector_neg_test = []
        if cross_validation_mode:
            f_vector_pos_test = self.setup_feature_vectors_for_classifier(
                datasets[1][0])
            f_vector_neg_test = self.setup_feature_vectors_for_classifier(
                datasets[1][1])

        return [[f_vector_pos_train, f_vector_neg_train],
                [f_vector_pos_test, f_vector_neg_test], doc_freq_map]

    #===========================================================================#
    # TEXT NORMALIZATION
    # Functions to facilitate text normalization for all datasets
    #===========================================================================#
    def tokenize_datasets_OLD(self, datasets):
        for i in range(len(datasets)):
            for j in range(len(datasets[i])):
                dict_class_documents = datasets[i][j]

                for doc_name in dict_class_documents.keys():
                    dict_class_documents[doc_name] = \
                      self.Tokenizer.tokenize(dict_class_documents[doc_name])
        return datasets

    def tokenize_datasets(self, datasets):
        doc_freq_map = {}

        for i in range(len(datasets)):
            for j in range(len(datasets[i])):
                dict_class_documents = datasets[i][j]

                for doc_name in dict_class_documents.keys():
                    dict_class_documents[doc_name] = self.Tokenizer.tokenize(
                        dict_class_documents[doc_name])

                    # Construct doc freq map on-the-fly
                    tokens_processed_before = []
                    for token in dict_class_documents[doc_name]:
                        if token not in tokens_processed_before:  # unique tokens in a doc
                            tokens_processed_before.append(token)
                            if token not in doc_freq_map.keys(
                            ):  # if token is newly found, initialize
                                doc_freq_map[token] = [doc_name]
                            else:
                                doc_freq_map[token].append(
                                    doc_name
                                )  # since the word appears in this doc

        return [datasets, doc_freq_map]
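    # Shape of the map built above (doc names invented for illustration):
    #   doc_freq_map == { 'battery': ['doc_12', 'doc_98'], 'screen': ['doc_12'], ... }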

    #===========================================================================#
    # TF-IDF VECTORIZATION
    # Compute TF-IDF vectors for every document
    #===========================================================================#
    def setup_tfidf_vector(self, NUM_DOCS, datasets, doc_freq_map):
        vocab = list(doc_freq_map.keys())

        for i in range(len(datasets)):
            for j in range(len(datasets[i])):
                dict_class_documents = datasets[i][j]

                for doc_name in dict_class_documents.keys():
                    doc = dict_class_documents[doc_name]
                    f_vector = [0] * len(vocab)

                    for token in doc:
                        if token in vocab:
                            tf = doc.count(token)
                            log_tf = (1 + log(tf)) if tf > 0 else 0.0
                            log_idf = log(NUM_DOCS / len(doc_freq_map[token]))
                            w = log_tf * log_idf
                            f_vector[vocab.index(token)] = w

                    dict_class_documents[doc_name] = f_vector

        return datasets
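    # Worked example of the weighting above (invented numbers; assumes `log` is math.log):
    # with NUM_DOCS = 1000, a token occurring tf = 3 times in a document and appearing
    # in 10 documents overall gets
    #   w = (1 + log(3)) * log(1000 / 10) ≈ 2.099 * 4.605 ≈ 9.66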

    def cull_doc_freq(self, doc_freq_map, threshold_num_docs):
        culled_df_map = {}
        for word in doc_freq_map.keys():
            if len(doc_freq_map[word]) > threshold_num_docs:
                culled_df_map[word] = doc_freq_map[word]
        return culled_df_map

    #===========================================================================#
    # CONSTRUCT VOCABULARY & DOC FREQ MAP
    # Set up data structures that hold the vocab and doc freq of every word
    #===========================================================================#
    def setup_vocab(self, dataset, threshold):
        count_vocab = {}
        vocab = []
        for doc_name in dataset.keys():
            for token in dataset[doc_name]:
                if token not in count_vocab.keys():
                    count_vocab[token] = 1
                else:
                    count_vocab[token] += 1

                if token not in vocab and count_vocab[token] >= threshold:
                    vocab.append(token)

        return vocab

    """
  Sets up the doc frequency of words in a given dataset.
  A dataset is a dictionary of this format: { 'doc_name' :  ['Here', 'are', ...] }

  Returns a dictionary mapping each word in the chosen dataset to the list of
  documents it appears in, in this format: { 'Here' : ['doc_1', 'doc_9'], ... }
  """

    def setup_doc_freq(self, dataset):
        df = {}

        for doc_name in dataset.keys():
            for word in dataset[doc_name]:
                if word not in df.keys():
                    df[word] = [doc_name]
                else:
                    if doc_name not in df[word]:
                        df[word].append(doc_name)

        return df

    def get_chisq_vocab(self, data_pos_vocab, data_neg_vocab, docs_pos,
                        docs_neg, threshold):
        combined_vocabs = self.union_vocabs(data_pos_vocab, data_neg_vocab)
        N_pos_docs = len(docs_pos.keys())
        N_neg_docs = len(docs_neg.keys())

        feature_selected_vocab = []
        for word in (combined_vocabs):
            N_pos_docs_containing_word = self.get_num_contains_word(
                docs_pos, word)
            N_pos_docs_not_containing_word = N_pos_docs - N_pos_docs_containing_word

            N_neg_docs_containing_word = self.get_num_contains_word(
                docs_neg, word)
            N_neg_docs_not_containing_word = N_neg_docs - N_neg_docs_containing_word

            # no. of training docs that:
            N_00 = N_neg_docs_not_containing_word  #  in negative class, do not contain w
            N_01 = N_pos_docs_not_containing_word  #  in positive class, do not contain w
            N_10 = N_neg_docs_containing_word  #  in negative class,        contain w
            N_11 = N_pos_docs_containing_word  #  in positive class,        contain w

            chisq = 0
            denominator = ((N_11 + N_01) * (N_11 + N_10) *
                           (N_10 + N_00) * (N_01 + N_00))
            if denominator != 0:
                chisq = ((N_11 + N_10 + N_01 + N_00) *
                         pow(N_11 * N_00 - N_10 * N_01, 2)) / denominator

            if chisq > threshold:
                feature_selected_vocab.append(word)

        return feature_selected_vocab
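    # The statistic above is the standard 2x2 chi-square test of term/class independence:
    #   chi^2 = N * (N11*N00 - N10*N01)^2 /
    #           ((N11 + N01) * (N11 + N10) * (N10 + N00) * (N01 + N00)),  N = N11+N10+N01+N00
    # Words whose chi^2 exceeds `threshold` are kept as features.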

    def get_num_contains_word(self, df, word):
        docs_containing_word = []
        for doc_name in df.keys():
            if word in df[doc_name]:
                docs_containing_word.append(doc_name)
        return len(docs_containing_word)

    def union_vocabs(self, vocab_1, vocab_2):
        unioned_vocab = []
        for word in vocab_1:
            if word not in unioned_vocab:
                unioned_vocab.append(word)
        for word in vocab_2:
            if word not in unioned_vocab:
                unioned_vocab.append(word)
        return unioned_vocab

    #===========================================================================#
    # CONSTRUCT FEATURE VECTORS FOR EACH CLASS
    # Compute feature vectors representing each class' text document
    #===========================================================================#
    def setup_feature_vectors(self, vocab, dataset):
        fea_datasets = []
        dataset_f_vectors = []

        for doc_name in dataset.keys():
            doc = dataset[doc_name]
            DOC_N = len(doc)
            f_vector = [0] * len(vocab)

            # Count word occurrence with reference to vocab
            for word in doc:
                if word in vocab:
                    f_vector[vocab.index(word)] += 1

            # Normalize by the number of words in a document
            for k in range(len(f_vector)):
                f_vector[k] = f_vector[k] / DOC_N

            # Finished processing a feature vector of a doc
            dataset_f_vectors.append(f_vector)

        return dataset_f_vectors

    """
  Stack map of {'doc_name': [1.81, 0, 6.8...] ... } into a list of feature vectors
  """

    def setup_feature_vectors_for_classifier(self, doc_tfidf_vector_map):
        f_vectors = []
        for doc_name in doc_tfidf_vector_map.keys():
            f_vectors.append(doc_tfidf_vector_map[doc_name])
        return f_vectors

    #===========================================================================#
    # CONSTRUCT THE DATASET
    # Retrieves texts from training and test files
    #===========================================================================#
    """
  Prepares the datasets we will need for training and testing.
  Splits our corpus into positive and negative train/test sets.

  Returns a list of 2 pairs of tuples - one for train & test set, where each
  tuple contains 2 dictionaries - one for positives & negatives
  """

    def prep_dataset(self, positive_class_name, pos_frac, neg_frac_per_class):
        positives_fpc = self.get_texts_for_class(positive_class_name)
        N_pos_docs = len(positives_fpc)

        negatives_fpc_map = {}
        N_neg_docs = 0

        # Set up a dictionary containing { 'neg_class_name': [['53886', 'path_to_doc', 'c2'], [...] ...] }
        for class_name in self.class_names:
            if not (class_name == positive_class_name):
                negatives_fpc_map[class_name] = self.get_texts_for_class(
                    class_name)
                N_neg_docs += len(negatives_fpc_map[class_name])

        # Split the positive classes into train and test sets
        N_pos_train = int(N_pos_docs * pos_frac)
        N_pos_test = int(N_pos_docs * (1 - pos_frac))

        positives = self.sample_N_pos_texts(positives_fpc, N_pos_train)
        train_positives = positives[0]
        test_positives = positives[1]

        # Sample and split the negatives classes into train and test sets
        negatives = self.sample_N_neg_texts(negatives_fpc_map,
                                            neg_frac_per_class)
        train_negatives = negatives[0]
        test_negatives = negatives[1]

        return [[train_positives, train_negatives],
                [test_positives, test_negatives]]

    """
  Reads the train-class-list or test-class-list file to retrieve all the
  paths to each document

  Returns a list of 3-tuples in the format:
    [[doc_name, path_to_doc, class_name], ...]
  """

    def load_paths_to_training_text(self):
        with open(self.PATH_TO_CLASS_LIST, 'r') as filepath_class_file:
            filepath_class_lines = filepath_class_file.readlines()

        filename_path_classnames = []
        for ln in filepath_class_lines:
            filepath_class_pair = self.Tokenizer.split_on_whitespace_from_back(
                ln)
            filename = self.Tokenizer.split_on_slash_from_back(
                filepath_class_pair[0])[1]
            filepath_class_pair[1] = self.Tokenizer.strip_newline(
                filepath_class_pair[1])

            result = []
            result.append(filename)
            result.append(filepath_class_pair[0])
            result.append(filepath_class_pair[1])
            filename_path_classnames.append(result)

        return filename_path_classnames

    """
  Gets the list of all the class names in our corpus

  Returns a list of [String] class names
  """

    def get_class_names(self):
        result = []
        for filename_path_classname in self.fpc:
            candidate_class_name = filename_path_classname[2]
            if candidate_class_name not in result:
                result.append(candidate_class_name)
        return result

    """
  Gets a list of filenames classified as `class_name`

  Returns a list of up to LIMIT (optional) 3-tuples in the format:
    [[doc_name, path_to_doc, class_name], ...]
  for the specified class_name
  """

    def get_texts_for_class(self, class_name, LIMIT=None):
        result = []
        for filename_path_classname in self.fpc:
            if filename_path_classname[2] == class_name:
                if LIMIT is not None and len(result) >= LIMIT:
                    break
                else:
                    result.append(filename_path_classname)
        return result

    """
  Retrieves the first N texts from a positive class

  Returns a tuple of a
    1.) dictionary of N positive training entries,
    2.) dictionary of N positive testing entries the format:

    [
      { '[doc_name]' : 'some long string of text...' ... },
      { '[doc_name]' : 'some long string of text...' ... }
    ]
  """

    def sample_N_pos_texts(self, pos_fpc, N):
        result_train = {}
        result_test = {}
        count = 0

        # Obtain the documents from each class specified in class_names
        # First N documents are sent for training, the remaining are sent for testing
        for fpc in pos_fpc:
            doc_name = fpc[0]
            path_to_doc = fpc[1]
            class_name = fpc[2]

            with open(path_to_doc, 'r', encoding='latin1') as f:
                if count < N:
                    result_train[doc_name] = f.read()
                    count += 1
                else:
                    result_test[doc_name] = f.read()

        return (result_train, result_test)

    """
  Retrieves the first N / len(negative_classes) texts from each of the
  specified list of negative classes

  Returns a tuple of a
    1.) dictionary of N negative training entries,
    2.) dictionary of N negative testing entries the format:

    [
      { '[doc_name]' : 'some long string of text...' ... },
      { '[doc_name]' : 'some long string of text...' ... }
    ]
  """

    def sample_N_neg_texts(self, negatives_fpc_map, neg_frac_per_class):
        negative_classes = negatives_fpc_map.keys()
        neg_train_map = {}
        neg_test_map = {}

        for class_name in negative_classes:
            N_docs = len(negatives_fpc_map[class_name])
            N_train = int(N_docs * neg_frac_per_class)

            for i in range(N_docs):
                # Retrieve elements in fpc 3-tuple
                doc_tuple = negatives_fpc_map[class_name][i]
                doc_name = doc_tuple[0]
                path_to_doc = doc_tuple[1]
                class_name = doc_tuple[2]

                with open(path_to_doc, 'r', encoding='latin1') as f:
                    if i < N_train:
                        neg_train_map[doc_name] = f.read()
                    else:
                        neg_test_map[doc_name] = f.read()

        return (neg_train_map, neg_test_map)
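A hypothetical end-to-end use of the DataPrepper above; the stop-word and class-list paths and the class name 'c1' are assumptions about the corpus layout described in the docstrings:

prepper = DataPrepper('stopwords.txt', 'train-class-list.txt')  # paths are assumptions
train_set, test_set, doc_freq_map = prepper.run('c1')           # 'c1' is an assumed class name
pos_train, neg_train = train_set
print(len(pos_train), 'positive and', len(neg_train), 'negative training vectors')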
Ejemplo n.º 46
0
    def __init__(self, kernel_type):
        self.kernel = kernel_type
        self.tokenizer = Tokenizer()
        self.__init_classifier(kernel_type)
Ejemplo n.º 47
0
#-------------------------------

import torch
from torch.utils.data import DataLoader
from MyDataSet import MyDataSet
from Tokenizer import Tokenizer
from pad import pad
from Config import Config
from Seq2Seq import Seq2Seq

if __name__ == '__main__':
    source_path = '../data/test/source.txt'
    target_path = '../data/test/target.txt'
    vocab_path = '../data/vocab.txt'
    model_path = '../model/model.pth'
    tokenizer = Tokenizer(vocab_path)
    config = Config()
    fr = open('../result/test.txt', 'w', encoding='utf-8-sig')  # file for storing prediction results

    loader = DataLoader(dataset=MyDataSet(source_path, target_path, tokenizer),
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=2,
                        collate_fn=pad,
                        drop_last=False)  # keep the last (possibly smaller) batch
    device = torch.device('cpu')
    model = Seq2Seq(config)
    model.to(device)
    # load the trained model
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model'])
Ejemplo n.º 48
0
            object = utils.getObject(wordtags, size, i)
            firedNode = self.findFiredNode(object)
            if firedNode.depth > 0:
                if firedNode.conclusion == "B":
                    sb = sb + " " + wordtags[i].form
                else:
                    sb = sb + "_" + wordtags[i].form
            else:
                if wordtags[i].tag == "B":
                    sb = sb + " " + wordtags[i].form
                else:
                    sb = sb + "_" + wordtags[i].form
        return sb.strip()

    # def segmentRawString(self,strs:str)->str:
    #     return self.segmentTokenizedString(" ".join(Tokenizer.tokenize(strs)))
    def segmentRawSentences(self, tokenizer: Tokenizer, strs: str):
        sentence = tokenizer.joinSentences(tokenizer.tokenize(strs))
        return self.segmentTokenizedString(sentence)


if __name__ == "__main__":
    rdrsegment = RDRSegmenter()
    tokenizer = Tokenizer()
    t = time.time()
    output = rdrsegment.segmentRawSentences(
        tokenizer,
        "hôm nay tôi đau bụng cảm sốt nhức đầu ho khan tại Hà Nội có triệu chứng bị Covid 19"
    )
    print(output, time.time() - t)
Ejemplo n.º 49
0
'''
Created on Nov 2, 2011

@author: Calvin
'''

if __name__ == '__main__':
    from ProgramParse import ProgramParse
    from Tokenizer import Tokenizer

    t = Tokenizer()
    prog = ProgramParse()
    f = open("../testinputs/program.bl")

    prog.parse(f, t)

    print "\n\n" + str(prog)
Ejemplo n.º 50
0
	def __init__(self):
		self.classes = {}
		self.documentCount = 0
		self.tokenizer = Tokenizer("");
Ejemplo n.º 51
0
def main(argv):
    # construct our pipeline list reading from command line args
    # still need to figure out best way to pass parameters on command
    # line

    global verbose
    global norm
    split = None

    transforms = []
    for arg in argv[0].split(","):
        if arg == "toke":
            transforms.append(Tokenizer())
        elif arg == "stem":
            transforms.append(Stemmer())
        elif arg == "stem-porter":
            transforms.append(Stemmer(mode='Porter'))
        elif arg == "stem-lancaster":
            transforms.append(Stemmer(mode='Lancaster'))
        elif arg == "stem-lemmatize":
            transforms.append(Stemmer(mode='Lemmatize'))
        elif arg == "vect":
            transforms.append(Vectorizer())
        elif arg == "vect-tfidf":
            transforms.append(Vectorizer(mode='TFIDF'))
        elif arg == "vect-count":
            transforms.append(Vectorizer(mode='Count'))
        elif arg == "vect-lda-2":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=2))
        elif arg == "vect-lda-10":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=10))
        elif arg == "vect-lda-25":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=25))
        elif arg == "vect-lda-50":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=50))
        elif arg == "vect-lda-150":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=150))
        elif arg == "vect-lda-500":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=500))
        elif arg == "svm":
            transforms.append(Model('svm'))
        elif arg == "nb":
            transforms.append(Model('nb'))
        elif arg == "lr":
            transforms.append(Model('lr'))
        elif arg == "nn":
            transforms.append(
                Model('nn', inputDim=10000)
            )  # input dimension must match the Vectorizer's feature limit
        elif arg == "norm":
            norm = True
        elif arg == "no-verb":
            verbose = False
        elif arg == "split-sentences":
            split = "sentences"
        elif arg == "nn-optim":
            # Memory optimized neural network.
            transforms.append(
                OptimNN(vecMode='TFIDF', epochs=2, batchSize=2048))
        else:
            raise Exception(f"Invalid transformer {arg}")
    pipe = Pipeline(transforms, norm=norm)

    # read our data (hardcoded for now)
    df0 = pd.read_pickle(
        "./data/democrat_comments.pkl")  #.sample(frac = 0.05) # DEBUG ONLY
    df1 = pd.read_pickle(
        "./data/republican_comments.pkl")  #.sample(frac = 0.05) # DEBUG ONLY

    if (split is not None):
        if (verbose):
            print('Splitting Democrat comments')
        df0 = splitRows(df0, mode=split, verbose=verbose)

        if (verbose):
            print('Splitting Republican comments')
        df1 = splitRows(df1, mode=split, verbose=verbose)

    label0 = df0.subreddit.iloc[0]
    label1 = df1.subreddit.iloc[0]

    # concatenate and clean our data
    X = pd.concat([df0.body, df1.body], ignore_index=True)
    y = pd.concat([df0.subreddit, df1.subreddit],
                  ignore_index=True).replace(to_replace=[label0, label1],
                                             value=[0, 1])

    # split into training and test
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    if (verbose):
        print('Applying Transforms and Training Model')
        print('Data: ./data/democrat_comments.pkl + ./data/republican_comments.pkl')
        print('Transforms:', argv[0])
    # fit our data
    pipe.fit_transform(X_train, y_train)

    # do the prediction
    y_pred = pipe.predict(X_test)
    results = pipe.validate(y_pred, y_test, True, True)

    # get most surprising misclassifications for class 0
    print("Most surprising texts misclassified as class 0")
    idx_list = heapq.nlargest(5, results[2][0], key=lambda x: x[1])
    for i, (idx, prob) in enumerate(idx_list):
        print(f"{i}) probability class 1 = {prob}\n{X_test[idx]}, \n")

    # get most surprising misclassifications for class 1
    print("Most surprising texts misclassified as class 1")
    idx_list = heapq.nlargest(5, results[2][1], key=lambda x: x[1])
    for i, (idx, prob) in enumerate(idx_list):
        print(f"{i}) probability class 0 = {prob}\n{X_test[idx]}\n")
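The pipeline is driven by a single comma-separated argument; a hypothetical invocation of main() using transform names taken from the branches above:

main(["toke,stem-porter,vect-tfidf,nb,norm"])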
Ejemplo n.º 52
0
def build_data(input_file):
    tokenizer = Tokenizer(input_file)
    rows = tokenizer.next_int()
    cols = tokenizer.next_int()
    num_cars = tokenizer.next_int()
    num_rides = tokenizer.next_int()
    bonus = tokenizer.next_int()
    total_steps = tokenizer.next_int()
    tokenizer.next_line()
    # building all rides
    rides = []
    for i in range(num_rides):
        a = tokenizer.next_int()
        b = tokenizer.next_int()
        x = tokenizer.next_int()
        y = tokenizer.next_int()
        s = tokenizer.next_int()
        f = tokenizer.next_int()
        tokenizer.next_line()
        rides.append(Ride(a, b, x, y, s, f, i))
    cars = []
    for i in range(num_cars):
        cars.append(Car())
    return rides, cars, bonus, num_rides, num_cars
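A hypothetical call to build_data; the filename is an assumption, and the input format is inferred from the reads above (a header line of rows, cols, cars, rides, bonus, steps, then one "a b x y s f" line per ride):

rides, cars, bonus, num_rides, num_cars = build_data("a_example.in")
print(len(rides), "rides for", len(cars), "cars; bonus per on-time ride =", bonus)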
Ejemplo n.º 53
0
sources = glob.glob(
    "./{}/*.jack".format(in_file)) if os.path.isdir(in_file) else [in_file]
for source in sources:
    base_name = source[:-len(".jack")]
    in_file = source
    tokenizer_outfile = "{}T.xml".format(base_name)
    compilation_engine_outfile = "{}.xml".format(base_name)

    with open(tokenizer_outfile, 'w') as tokenizer_file_out:
        tokenizer_xml_writer = XMLWriter(tokenizer_file_out)

        tokenizer_xml_writer.open_tag('tokens')

        with open(in_file, 'rb') as f_in:
            tokenizer = Tokenizer(f_in)

            while True:
                try:
                    tokenizer_xml_writer.write_token(tokenizer.advance())
                except TokenizerReachedEndOfFileException:
                    print('Reached end')
                    break

        tokenizer_xml_writer.close_tag('tokens')

    with open(compilation_engine_outfile, 'w') as ce_file_out:
        ce_xml_writer = XMLWriter(ce_file_out)

        with open(in_file, 'rb') as f_in:
            tokenizer = Tokenizer(f_in)
Ejemplo n.º 54
0
import torch
from torch.utils.data import DataLoader
from MyDataSet import MyDataSet
from Tokenizer import Tokenizer
from pad import pad
from sequence_mask_loss import sequence_mask_loss
from Config import Config
from Seq2Seq import Seq2Seq
from rouge import Rouge
from convert_to_RougePattern import convert_to_RougePattern
import random

if __name__ == '__main__':
    source_path = '../data/train/source.txt'
    target_path = '../data/train/target.txt'
    eval_source_path = '../data/eval/source.txt'
    eval_target_path = '../data/eval/target.txt'
    vocab_path = '../data/vocab.txt'
    log_path = '../log/log.txt'
    log = open(log_path, 'w', encoding='utf-8')
    tokenizer = Tokenizer(vocab_path)
    config = Config()
    rouge = Rouge()  # evaluation metric

    # training set loader
    loader = DataLoader(dataset=MyDataSet(source_path, target_path, tokenizer),
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=0,
                        collate_fn=pad,
                        drop_last=False)  # keep the last (possibly smaller) batch
    # evaluation set loader
    eval_loader = DataLoader(dataset=MyDataSet(eval_source_path,
                                               eval_target_path, tokenizer),
                             batch_size=config.batch_size,
                             shuffle=True,
Ejemplo n.º 55
0
            return self.exp.EvalExp()


# Reading the file and creating a list of words
programfile = sys.argv[1]
filename = sys.argv[2]
infile = open(filename, "r")

tail = []
for line in infile:
    for string in line.split():
        tail.append(string)
infile.close()

# Opening output file for writing
nameAndFormat = filename.split(".txt")
outfilename = "testoutput.txt".join(nameAndFormat)
outfile = open(outfilename, "w")
prettyprint = open(programfile, "w")

# Creating a tokenizer
t = Tokenizer(tail)

Program = Prog()
Program.ParseProg()
Program.PrintProg(prettyprint)
Program.ExecProg()

prettyprint.close()
outfile.close()
Ejemplo n.º 56
0
    def analyzeFile(self, filename):
        tokenizer = Tokenizer(filename)
        tokenizer.tokenize()
        compiler = CompilationEngine(filename)
        compiler.compile()
        return

    def __init__(self):
        print("[Tester] instantiated!")
        self.Tokenizer = Tokenizer(PATH_TO_STOP_WORDS)