class Class():
    def __init__(self, name, content, count):
        self.name = name
        self.contentRaw = content
        self.tokens = Tokenizer(content)
        self.condProb = self.tokens.getTokens()
        self.count = count
        self.prior = 0.0

    def setPrior(self, prior):
        self.prior = prior

    def condProbs(self):
        return self.condProb

    def condProb(self, token):
        return self.condProb[token]

    def getName(self):
        return self.name

    def getTokens(self):
        return self.tokens

    def getTokenSum(self):
        return len(self.tokens.getTokens())

    def getTokenSumIgnoreDuplicates(self):
        count = 0
        for t in self.tokens.getTokens():
            count += self.tokens.getTokens()[t]
        return count
def __init__(self, text, product_name):
    self.candidate_features = []
    self.feature_sentences = []
    self.product_name = product_name.lower().split('-')[0].split('_')
    t = Tokenizer()
    sents = t.sent_tokenize(text.lower())
    p = POSTagger()
    wnl = WordNetLemmatizer()
    for sent in sents:
        tagged_sent = p.nltk_tag(t.word_tokenize(sent))
        feature_sent = {}
        feature_sent['sentence'] = sent
        feature_sent['tags'] = tagged_sent
        feature_sent['nouns'] = []
        feature_sent['noun_phrases'] = []
        for i in range(0, len(tagged_sent)):
            (word, tag) = tagged_sent[i]
            # Don't include proper nouns
            if tag.startswith('N') and tag != 'NNP':
                """
                Consecutive nouns might form a feature phrase, e.g. "picture quality".
                Meaningless phrases like "quality digital" are removed later because
                their frequency of occurrence is low.
                """
                if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1] and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1:
                    feature_sent['noun_phrases'].append(wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
                else:
                    feature_sent['nouns'].append(wnl.lemmatize(word))
        self.feature_sentences.append(feature_sent)
def process(self, filePath):
    from Tokenizer import Tokenizer
    import codecs
    fileName = filePath[filePath.rindex('/') + 1:]
    tk = Tokenizer()
    ofp = codecs.open(filePath, 'r', 'utf-8')
    lines = ofp.readlines()
    ofp.close()
    # Strip a UTF-8 BOM if present
    if lines[0][0] == unicode(codecs.BOM_UTF8):
        lines[0] = lines[0][1:]
    wfp = codecs.open(self.__savePath + fileName, 'w', 'utf-8')
    for line in lines:
        tempLine = line.strip('\r\n')
        tempLine = tempLine.strip(' ')
        sentences = tempLine.split(' ')
        if sentences[0] == '':
            wfp.write('\r\n')
            continue
        for i in range(len(sentences)):
            tokenList = tk.ckip(sentences[i].strip(' '))
            posList = []
            for token in tokenList:
                posList.append(str(len(token["term"])))  # join() needs strings, not ints
            if i == 0 and len(sentences) == 1:
                posStr = ','.join(posList)
            elif i == 0:
                posStr = ','.join(posList) + ','
            elif i == len(sentences) - 1:
                posStr = ',' + ','.join(posList)
            else:
                posStr = ',' + ','.join(posList) + ','
            wfp.write(posStr)
        wfp.write('\r\n')
    wfp.close()
    print "Chinese2POS: %s Write Done!!" % fileName
class ClassBank:
    def __init__(self):
        self.classes = {}
        self.documentCount = 0
        self.tokenizer = Tokenizer("")

    def addClass(self, classInst):
        self.classes[classInst.getName()] = classInst
        self.tokenizer.tokenize(classInst.contentRaw)
        self.documentCount = self.documentCount + classInst.count

    def getClass(self, name):
        if name in self.classes:
            return self.classes[name]
        return False

    def getClasses(self):
        return self.classes

    def getVocabulary(self):
        return self.tokenizer

    def getVocabularySum(self):
        return len(self.tokenizer.getTokens())

    def train(self):
        v = self.getVocabulary().getTokens()
        n = self.documentCount
        for c in self.classes:
            c = self.classes[c]
            c.setPrior(float(c.count) / n)  # float() avoids integer division under Python 2
            t = c.getTokens().getTokens()
            for key in c.condProb:
                c.condProb[key] = float(t[key] + 1) / (len(t) + v[key])
def init_feature_sentences(self, total_content):
    t = Tokenizer()
    p = POSTagger()
    wnl = WordNetLemmatizer()
    sentences = t.sent_tokenize(total_content.lower())
    for sentence in sentences:
        tagged_sentence = p.nltk_tag(t.word_tokenize(sentence))
        # Initializing the feature sentence dictionary
        feature_sentence = {}
        feature_sentence['sentence'] = sentence
        feature_sentence['tags'] = tagged_sentence
        feature_sentence['nouns'] = []
        feature_sentence['noun_phrases'] = []
        # Finding the nouns/noun phrases in the tagged sentence
        for i in range(0, len(tagged_sentence)):
            (word, tag) = tagged_sentence[i]
            # Chunking
            if tag.startswith('N') and tag != 'NNP':
                if i > 0 and len(feature_sentence['nouns']) > 0 and tagged_sentence[i - 1][0] == feature_sentence['nouns'][-1] and feature_sentence['sentence'].find(feature_sentence['nouns'][-1] + ' ' + word) > -1:
                    feature_sentence['noun_phrases'].append(wnl.lemmatize(feature_sentence['nouns'].pop() + ' ' + word))
                else:
                    feature_sentence['nouns'].append(wnl.lemmatize(word))
        self.feature_sentences.append(feature_sentence)
def indexar(self, str):
    index = {}
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize(str)
    for term in tokens:
        index[term] = index.get(term, 0) + 1
    return index
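# A minimal usage sketch for indexar() above; the enclosing class name ("Indexador")
# and the exact splitting rules of Tokenizer.tokenize() are assumptions, not shown in
# the snippet itself.
#
#     indexer = Indexador()
#     freqs = indexer.indexar("to be or not to be")
#     # If tokenize() splits on whitespace, freqs == {'to': 2, 'be': 2, 'or': 1, 'not': 1}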
def main():
    folders = {}
    folders["politik"] = "data/politik"
    folders["sport"] = "data/sport"
    folders["wirtschaft"] = "data/wirtschaft"
    bank = ClassBank()
    l = Loader()
    # train data
    for classname, folder in folders.iteritems():
        count = 0
        content = ""
        for file in os.listdir(folder + "/train/"):
            if file.endswith(".txt"):
                count = count + 1
                content = content + " " + l.load_txt(folder + "/train/" + file)
        c = Class(classname, content, count)
        bank.addClass(c)
    bank.train()
    c = Classifier()
    # test data
    for classname, folder in folders.iteritems():
        print "\n=== Testing", classname, "===\n"
        for file in os.listdir(folder + "/test/"):
            if file.endswith(".txt"):
                tokenizer = Tokenizer(l.load_txt(folder + "/test/" + file))
                classifiedClass = c.classify(tokenizer.getTokens(), bank)
                print file, "=", classifiedClass.getName()
def test_full_example(self):
    tokenizer = Tokenizer()
    response_builder = ResponseBuilder()
    start_time = time.time()
    tokens = tokenizer.tokenize(FULL_EXAMPLE)
    result = response_builder.process_tokens(tokens)
    end_time = time.time()
    print result
    print "Calculated in %s" % str(end_time - start_time)
class UnMinifier:
    constants = JSConstants()

    def __init__(self):
        self.activeKeywords = dict()
        self.tokenizer = Tokenizer()

    def unminify_file(self, filename):
        fileString = ''
        with open(filename, 'r') as fileStream:
            fileString = fileStream.read()
        # 1. Validate the file
        if self.checkBrackets(fileString) == False:
            return False
        # 2. Extract the number of keywords by count
        if self.xtractKeywordCounts(fileString) < 0:
            return False
        # 3. Print the test tokens
        self.tokenizer.tokenize(fileString)
        # Success
        return True

    def checkBrackets(self, fileString):
        try:
            openRounds = fileString.count(self.constants.openRound)
            closeRounds = fileString.count(self.constants.closeRound)
            openCurls = fileString.count(self.constants.openCurl)
            closeCurls = fileString.count(self.constants.closeCurl)
            openSquares = fileString.count(self.constants.openSquare)
            closeSquares = fileString.count(self.constants.closeSquare)
            if openRounds != closeRounds or openCurls != closeCurls or openSquares != closeSquares:
                raise ValueError("Unbalanced brackets")
            else:
                print("The brackets are good..")
        except Exception as ex:
            print("Error:: your file misses either a ( or { or } or ), why don't you check it?")
            return False
        # Success
        return True

    def xtractKeywordCounts(self, fileString):
        # Find the number of occurrences of each keyword in fileString
        try:
            for keyword in self.constants.keywordDictionary.keys():
                if fileString.count(keyword) > 0:
                    self.activeKeywords.update({keyword: fileString.count(keyword)})
        except Exception as ex:
            print("Error:: issue with reading from the fileString variable")
            return -1
        # Success
        return len(self.activeKeywords.keys())
def test_basic_tokenize(self):
    str = """
        S10F11
        accept reply: false
        asc,"test2"
    """
    tokenizer = Tokenizer()
    result = tokenizer.tokenize(str)
    self.assertEqual(len(result), 3)
def __init__(self, link, pagetitle, outgoing, html):
    self.name = link
    self.title = pagetitle
    self.outLinks = {}
    self.incoming = {}
    self.content = html
    self.pageRank = 1
    t = Tokenizer(self.content)
    self.tokens = t.getTokens()
    for ol in outgoing:
        self.addOut(ol)
def __init__(self, name, content, count):
    self.name = name
    self.contentRaw = content
    self.tokens = Tokenizer(content)
    self.condProb = self.tokens.getTokens()
    self.count = count
    self.prior = 0.0
def __init__(self, model_dir):
    '''
    @param model_dir: The directory containing all trained model files
    '''
    self.models = {}
    self.tokenizer = Tokenizer()
    os.path.walk(model_dir, install_all_model, self.models)
    print "All models loaded"
def test_full_tokenize(self):
    format_1_string = FULL_EXAMPLE
    tokenizer = Tokenizer()
    result = tokenizer.tokenize(format_1_string)
    self.assertEqual(len(result), 38)
    self.assertEqual(result[0].token_type, TokenType.StreamAndFunction)
    self.assertEqual(result[0].token_value, [2, 3])
    self.assertEqual(result[1].token_type, TokenType.Header)
    self.assertEqual(result[1].token_value, ["accept reply", "true"])
    self.assertEqual(result[2].token_type, TokenType.ArrayDef)
    self.assertEqual(result[2].token_value, 3)
    self.assertEqual(result[3].token_type, TokenType.Value)
    res_3_value_token = ValueToken(result[3])
    res_3_string_value = """This is some text that gets to the next line and the next line and possibly any number of lines."""
    self.assertEqual(res_3_value_token.get_value(), res_3_string_value.replace('\n', '\\n').replace('\t', '\\t'))
def __init__(self, phrase, index):
    self.tokens = Tokenizer(phrase)
    self.index = index
    self.ranking = {}
    self.lengths = {}
    self.tlength = 0
    self.calc_document_length()
    self.calc_query_length()
    self.calc_ranking()
class ClassBank():
    def __init__(self):
        self.classes = {}
        self.documentCount = 0
        self.tokenizer = Tokenizer("")

    def addClass(self, classInst):
        self.classes[classInst.getName()] = classInst
        self.tokenizer.tokenize(classInst.contentRaw)
        self.documentCount = self.documentCount + classInst.count

    def getClass(self, name):
        if name in self.classes:
            return self.classes[name]
        return False

    def getClasses(self):
        return self.classes

    def getVocabulary(self):
        return self.tokenizer

    def getVocabularySum(self):
        return len(self.tokenizer.getTokens())

    def train(self):
        v = self.getVocabulary().getTokens()
        n = self.documentCount
        for c in self.classes:
            c = self.classes[c]
            c.setPrior(float(c.count) / n)  # float() avoids integer division under Python 2
            t = c.getTokens().getTokens()
            tCount = 0
            for tKey, tValue in t.iteritems():
                tCount = tCount + (tValue + 1)
            for key, value in v.iteritems():
                vCount = 0
                if key in t:
                    vCount = t[key]
                c.condProb[key] = float(vCount + 1) / (tCount + len(v))
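# Reference note for train() above: with Laplace (add-one) smoothing, the conditional
# probability it fills in corresponds to
#     P(w | c) = (count(w, c) + 1) / (sum over w' of (count(w', c) + 1) + |V|)
# where tCount accumulates the smoothed token counts of class c and len(v) is the
# vocabulary size; the float() casts keep the division from truncating to 0 under Python 2.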
def __init__(self, text):
    self.candidate_features = []
    self.feature_sentences = []
    t = Tokenizer()
    sents = t.sent_tokenize(text)
    p = POSTagger()
    for sent in sents:
        tagged_sent = p.nltk_tag(t.nltk_tokenize(sent))
        feature_sent = {}
        feature_sent['sentence'] = sent
        feature_sent['nouns'] = []
        feature_sent['noun_phrases'] = []
        for i in range(0, len(tagged_sent)):
            (word, tag) = tagged_sent[i]
            if tag.startswith('N') and tag != 'NNP':
                if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1]:
                    feature_sent['noun_phrases'].append(feature_sent['nouns'].pop() + ' ' + word)
                else:
                    feature_sent['nouns'].append(word)
        self.feature_sentences.append(feature_sent)
def main(args):
    # Checks to make sure the right number of params are given.
    # Params: filename
    if len(args) != 2:
        sys.exit("Usage: " + args[0] + " <filename>")
    inFile = args[1]
    tokenizer = Tokenizer(inFile)
    tokenizer.tokenize()
    tokenizer.toXML(inFile)
    parser = Parser(tokenizer.tokens(), VMWriter(inFile))
    parser.parse()
class Controler():
    '''Implement the Server's API logic.
    '''

    def __init__(self, model_dir):
        '''
        @param model_dir: The directory containing all trained model files
        '''
        self.models = {}
        self.tokenizer = Tokenizer()
        os.path.walk(model_dir, install_all_model, self.models)
        print "All models loaded"

    def list_model(self):
        '''Logic for the Server's list_model API
        '''
        print self.models
        return {name: model.labels for name, model in self.models.items()}

    def predict(self, model_name, sentence):
        '''Logic for the Server's predict API
        @param model_name: the model to be used to predict the emotion
        @param sentence: the target sentence
        @return: a list of scores, corresponding to the emotions of the selected model
        '''
        try:
            if model_name == 'YAHOO_svm':
                tokenized_us = self.tokenizer.tokenizeStr_without_timeout(sentence.encode('utf8'))[0][0]
                cleanr = re.compile('\([A-Za-z].*?\)')
                tokenized_us = re.sub(cleanr, '', tokenized_us.decode('utf8')).encode('utf8')
                sentence = ' '.join(tokenized_us.split(' '))
            else:
                sentence = str(TextBlob(sentence).translate(to='en'))
        except Exception as e:
            print(e)
            pass
        pred = self.models[model_name].predict(sentence)
        # Both branches of the original if/else returned the same value; a single return is equivalent.
        return {'res': pred}
def __init__(self, file, fsm):
    self.steps_for_robot = {}
    # The FSM that was used to generate this experiment.
    self.fsm = fsm
    # A dictionary of dictionaries that keeps, for each robot, how many times each state was visited.
    self.state_counter_for_robot = {}
    # The intermediate rewards
    self.rewards = []
    # Number of robots used during this experiment
    self.number_of_robots = 0
    # Skip the first lines
    while True:
        line = peek_line(file)
        if line.startswith("[INFO]"):
            file.readline()
            continue
        else:
            break
    robot_index = -1
    state_counter_for_robot = {}
    while not peek_line(file).startswith("[INFO]"):
        line = file.readline()
        if line.startswith("Obj"):
            match = re.match(r'Obj --t (\d+) --o (\d+)', line)
            self.rewards.append(int(match.group(2)))
        elif line.startswith("--t"):
            builder = ExperimentStepBuilder()
            tokens = Tokenizer(line)
            while tokens.has_more_tokens():
                token = tokens.next_token()
                if token == "--t":
                    index = int(tokens.next_token())
                    builder.set_step_index(index)
                    if index == 0:
                        state_counter_for_robot = {}
                        robot_index += 1
                    builder.set_robot_index(robot_index)
                    # Set also the reward
                    builder.set_reward(self.rewards[index])
                elif token.startswith("--s"):
                    match = re.match(r'--s(\d+)', token)
                    state_index = int(match.group(1))
                    builder.set_state_index(state_index)
                    increment_state_counter(self.state_counter_for_robot, robot_index, state_index, len(self.fsm.states))
                    # Skip the identifier
                    tokens.next_token()
                elif token == "--n":
                    number_of_neighbours = int(tokens.next_token())
                    builder.set_number_of_neighbours(number_of_neighbours)
                elif token == "--f":
                    ground_sensor_reading = float(tokens.next_token())
                    builder.set_ground_sensor_reading(ground_sensor_reading)
                elif token.startswith("--c"):
                    match = re.match(r'--c(\d+)', token)
                    condition = int(match.group(1))
                    builder.set_condition(condition)
                    transition_type = int(tokens.next_token())  # type of the transition
                    builder.set_transition_type(transition_type)
                    value = int(tokens.next_token())  # value (with the new code this is always 1)
                    builder.set_value(value)
                    probability = float(tokens.next_token())  # probability that the transition was active
                    builder.set_probability(probability)
                elif token == "--a":
                    active_transitions = int(tokens.next_token())
                    builder.set_active_transition(active_transitions)
            step = builder.build()
            add(self.steps_for_robot, robot_index, step)
            self.number_of_robots = robot_index + 1
        elif line.startswith("Score"):
            match = re.match(r'Score (\d+)', line)
            self.result = int(match.group(1))
        elif len(line) == 0:
            break
        else:
            line = file.readline()
            print("Unknown line", line)
def tokenizer(file):
    from Tokenizer import Tokenizer
    return Tokenizer(file)
def __init__(self, kernel_type):
    self.kernel = kernel_type
    self.tokenizer = Tokenizer()
    self.__init_classifier(kernel_type)
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from sklearn.model_selection import train_test_split
from Tokenizer import Tokenizer

data = pd.read_json('raw_data/data.json', lines=True)
raw_text, raw_labels = data['headline'].values[0:10000], data['is_sarcastic'].values[0:10000]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_text)
tokenized_headlines = tokenizer.transform(raw_text)
max_length = max([len(x) for x in tokenized_headlines])
padded_headlines = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_headlines, maxlen=max_length, padding='post')

with open('glove_embedding/tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

onehot_labels = tf.keras.utils.to_categorical(raw_labels, num_classes=2)
train_features, test_features, train_labels, test_labels = train_test_split(
    np.array(padded_headlines), np.array(onehot_labels), test_size=0.4)

np.save('processed_data/x.npy', train_features)
np.save('processed_data/y.npy', train_labels)
np.save('processed_data/test_x.npy', test_features)
np.save('processed_data/test_y.npy', test_labels)
'''
Created on Sep 11, 2011

@author: calvin
'''
if __name__ == '__main__':
    from Tokenizer import Tokenizer
    t = Tokenizer()
    types = ("ERROR", "WHITESPACE", "CONDITION", "KEYWORD", "IDENTIFIER", "COMMENT")
    f = open("../testinputs/program.bl")
    token = t.get_next_non_whitespace(f)
    while token[0] != '':
        print repr(token[0]) + " : " + types[token[1]]
        token = t.get_next_non_whitespace(f)
class MetaDataSentimentAnalisis:
    def __init__(self, kernel_type):
        self.kernel = kernel_type
        self.tokenizer = Tokenizer()
        self.__init_classifier(kernel_type)

    def __init_classifier(self, kernel_type):
        if kernel_type == 'rbf':
            self.classifier = svm.SVC(C=1, gamma=0.0000001)
        elif kernel_type == 'linear':
            self.classifier = svm.SVC(kernel='linear')
        elif kernel_type == 'liblinear':
            self.classifier = svm.LinearSVC()
        else:
            self.classifier = svm.SVC()
        self.vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True)

    def train_text(self, train_data_path, metadata_path, train=True,
                   parameters={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
                   store=False, storepath=""):
        parser = TweetParser()
        tweets = parser.parse(train_data_path, metadata_path)
        train_data = []
        train_labels = []
        clean_train = ""
        polarity = "NONE"
        if store and not train:
            print("store needs train ... train=True")
            train = True
        for tweet in tweets:
            clean_train = self.tokenizer.cleanText(tweet.content)
            train_data.append(clean_train)
            polarity = self.checkPolarity(tweet.polarity)
            train_labels.append(polarity)
            #print clean_train
            #print polarity
        # Create feature vectors
        train_vectors = self.vectorizer.fit_transform(train_data)
        if train:
            Cs = [0.001, 0.01, 0.1, 1, 10]
            gammas = [0.001, 0.01, 0.1, 1]
            param_grid = {'C': Cs, 'gamma': gammas}
            self.classifier = grid_search.GridSearchCV(self.classifier, param_grid, cv=3, n_jobs=4, verbose=1)
            self.classifier.fit(train_vectors, train_labels)
            if store:
                joblib.dump(self.classifier, storepath)
        return train_labels

    def svc_param_selection(self, X, y, nfolds):
        Cs = [0.001, 0.01, 0.1, 1, 10]
        gammas = [0.001, 0.01, 0.1, 1]
        param_grid = {'C': Cs, 'gamma': gammas}
        grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds, n_jobs=4, verbose=1)
        grid_search.fit(X, y)
        return grid_search.best_params_

    def train_features(self, train_data_path, metadata_path, features=[], train=True, store=True, storepath=""):
        (X, train_labels) = self.buildFeaturesFromCorpus(train_data_path, metadata_path, features)
        if train:
            Cs = [0.001, 0.01, 0.1, 1, 10]
            gammas = [0.001, 0.01, 0.1, 1]
            param_grid = {'C': Cs, 'gamma': gammas}
            self.classifier = grid_search.GridSearchCV(self.classifier, param_grid, cv=3, n_jobs=4, verbose=1)
            self.classifier.fit(X, train_labels)
            if store:
                joblib.dump(self.classifier, storepath)
        return train_labels

    def buildFeaturesFromCorpus(self, train_data_path, metadata_path, features=[], test=False):
        parser = TweetParser()
        tweets = parser.parse(train_data_path, metadata_path)
        train_data = []
        train_labels = []
        featuresDict = {}
        for tweet in tweets:
            if not tweet.jsonData:
                continue
            for feature in features:
                if feature not in featuresDict:
                    featuresDict[feature] = tweet_feature(feature)  # create the feature
                if feature == "geo" or feature == "place" or feature == "favorited":
                    if feature in tweet.jsonData:
                        pass
                        #print str(type(tweet.jsonData[feature]))
                        #print tweet.jsonData[feature]
                if feature in tweet.jsonData:
                    dataValue = tweet.jsonData[feature]
                    featuresDict[feature].data.append(dataValue)
                    featuresDict[feature].count += 1
                    featuresDict[feature].totalCount += 1
                else:
                    if featuresDict[feature].type == bool:
                        featuresDict[feature].data.append(False)
                    elif featuresDict[feature].type == int:
                        featuresDict[feature].data.append(0)
                    else:
                        featuresDict[feature].data.append("null")
                    featuresDict[feature].totalCount += 1
            polarity = self.checkPolarity(tweet.polarity)
            train_labels.append(polarity)
        for feature in features:
            featuresDict[feature].print_stats()
            if feature == "text":
                Xt = self.buildFeatureDataMatrix(featuresDict[feature], test)
                train_data.append(Xt)
            else:
                train_data.append(self.buildFeatureDataMatrix(featuresDict[feature], test))
        if train_data:
            #Xm = scipy.sparse.csc_matrix(train_data)
            #Xm = Xm.transpose(True)
            # X : sparse matrix, [n_samples, n_features]
            # Tf-idf-weighted document-term matrix.
            #print Xt.shape
            #print Xm.shape
            X = scipy.sparse.hstack(train_data)
        else:
            X = Xt
        return (X, train_labels)

    def test(self, test_data_path="", metadata_path="", load=False, model=""):
        parser = TweetParser()
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = []
        for tweet in tweets:
            test_data.append(self.tokenizer.cleanText(tweet.content))
        test_vectors = self.vectorizer.transform(test_data)
        if load and model:
            self.classifier = joblib.load(model)
        predictions = self.classifier.predict(test_vectors)
        return predictions

    def test_features(self, test_data_path="", metadata_path="", features=[], load=False, model=""):
        (X, train_labels) = self.buildFeaturesFromCorpus(test_data_path, metadata_path, features, True)
        if load and model:
            self.classifier = joblib.load(model)
        predictions = self.classifier.predict(X)
        return (predictions, train_labels)

    def predict(self, test_data_path="", metadata_path="", model=""):
        parser = TweetParser()
        tweets = parser.parse(test_data_path, metadata_path)
        test_data = []
        for tweet in tweets:
            test_data.append(self.tokenizer.cleanText(tweet.content))
        test_vectors = self.vectorizer.transform(test_data)
        self.classifier = joblib.load(model)
        predictions = self.classifier.predict(test_vectors)
        for text, prediction in zip(test_data, predictions):  # zip() fixes the original tuple iteration
            print text
            print prediction

    def checkPolarity(self, polarity_elements):
        polarity = 'NONE'
        if polarity_elements:
            for polarity_element in polarity_elements:
                polarity = polarity_element
                if not polarity == 'NONE':
                    break
        return polarity

    def buildFeatureDataMatrix(self, feature, test=False):
        featureData = []
        for data in feature.data:
            if feature.name == "text":
                featureData.append(self.tokenizer.cleanText(data))
            else:
                featureData.append(data)
        if feature.name == "text":
            if not test:
                text_features = self.vectorizer.fit_transform(featureData)
            else:
                text_features = self.vectorizer.transform(featureData)
            return text_features
        else:
            return scipy.sparse.csc_matrix(featureData).transpose(True)
import re
import os
import sys
import json
from collections import OrderedDict
from Tokenizer import Tokenizer

dir1 = sys.argv[1]  # dir for the top-level category classifier: prior, condprobs & config
dir2 = sys.argv[2]  # dir for the subcat classifier; contains a subdir for every top-level category, each with prior & condprobs
infile = open(sys.argv[3], 'r')  # input file, tab-separated
opformat = sys.argv[4]  # json or tsv
assert opformat == 'json' or opformat == 'tsv'

prior = json.load(open(os.path.join(dir1, 'prior.json'), 'rb'))
condprobs = json.load(open(os.path.join(dir1, 'probs.json'), 'rb'))
NB = NaiveBayes(prior, condprobs)
t = Tokenizer()
subcat_classifiers = {}
for k in prior.keys():
    p = json.load(open(os.path.join(dir2, re.sub('[ &]', '_', k), 'prior.json'), 'rb'))
    c = json.load(open(os.path.join(dir2, re.sub('[ &]', '_', k), 'probs.json'), 'rb'))
    subcat_classifiers[k] = NaiveBayes(p, c)

def unicodify(text):
    return text.encode('utf-8', 'ignore')

def print_line(d):
    if opformat == 'tsv':
        print "\t".join(d.values()).encode('utf-8', 'ignore')
    if opformat == 'json':
        print json.dumps(d)
"h4": 20, "h5": 20, "h6": 20, "h7": 20, "strong": 5, "b": 3, "i": 3, "u": 3 } src = "D:/UCI/CS221_IR/HW3/IR_projecct3/WEBPAGES_RAW" folder = os.listdir(src) buffer = Buffer() totalDocs = 0 tokenizer = Tokenizer() blockedList = ['script'] anchorpath = "anchor.txt" anchortext = LoadanchorText(anchorpath) for directory in folder: currDir = src + "/" + directory if not os.path.isdir(currDir): continue files = os.listdir(currDir) lines = list() for file in files: # print file currFile = currDir + "/" + file #skip for directory if not os.path.isfile(currFile): continue
print("Enter/Paste the content from the webpage(select all with ctrl or cmd + a and copy paste in terminal). \n Ctrl-D or Ctrl-Z ( windows ) to run program it.") contents = [] while True: try: line = input() except EOFError: break contents.append(line) inputArray = contents tokenizer = Tokenizer(inputArray) print("Results") print(tokenizer.getTestResult()) print("Input") print(tokenizer.getTestInput()) print("iterationlimit") print(tokenizer.getIterlimit()) print("iteratoramount") print(tokenizer.getIterator()) print("fixed modifier") print(tokenizer.getFixed()) calculator = Calculator(tokenizer) calculator.calculate()
cnn: ".cnn", atlantic: ".atl", vice: ".vc", blaze: ".blz", federalist: ".fd", wsj: ".wsj", nyt: ".nyt", #huffpost: ".huf" } papers = [foxnews, breitbart, thinkprogress, cnn, atlantic, vice, blaze, federalist, wsj, nyt]#, huffpost] news_pool.set(papers, threads_per_source=5) news_pool.join() translator = str.maketrans('', '', string.punctuation) tk = Tokenizer() for source in papers: name = source.domain.split(".") if name[0] == "www": name = name[1] else: name = name[0] print("beginning {} crawl".format(name)) directory = '../articles/'+name if not os.path.exists(directory): os.makedirs(directory) for article in source.articles: article.parse() title = article.title.translate(translator) title = title.lower().replace(" ", "_") title += extensions[source]
stopwords=mapping["stopwords"] document=mapping["document"] miniprobability=mapping["miniprobability"] minitogether=mapping["minitogether"] set_word=mapping["set_word"] dict_frq_word=mapping["dict_frq_word"] word_freq_path = './data/word_freq.txt' # char set file common_char_path = './data/common_char_set.txt' # same pinyin char file same_pinyin_path = './data/same_pinyin.txt' # custom confusion set custom_confusion_path = './data/custom_confusion.txt' # custom word for segment custom_word_path = './data/custom_word.txt' # 特定拼音词汇表 custom_pinyin_word_path="./data/custom_pinyin_word.txt" tokenizer = Tokenizer(word_freq_path=word_freq_path, common_char_path=common_char_path, same_pinyin_path=same_pinyin_path, custom_confusion_path=custom_confusion_path, custom_word_path=custom_word_path) pm = PMI_test(tokenizer=tokenizer, stopwords=stopwords, document=document, miniprobability=miniprobability, minitogether=minitogether, set_word=set_word, dict_frq_word=dict_frq_word) print("documents read done") print('pm.calculate_lis("小提琴","朗姆酒")', pm.calculate_lis("小提琴", "朗姆酒")) print('pm.calculate_lis("小提琴","大钢琴")', pm.calculate_lis("小提琴", "大钢琴"))
def run(code):
    Parser.tokens = Tokenizer(code)
    Parser.tokens.selectNext()
    result = Parser.parseProgram()
    # print(result)  # Prints the AST
    return result
train['target_str'] = reduce(lambda x, y: x + y, [train[col].astype(str) for col in list_classes])
train['target_str'] = train['target_str'].replace('110101', '000000').replace('110110', '000000')

cvlist1 = list(StratifiedKFold(n_splits=10, random_state=786).split(train, train['target_str'].astype('category')))
cvlist2 = list(StratifiedShuffleSplit(n_splits=5, test_size=0.05, random_state=786).split(train, train['target_str'].astype('category')))

# Normalize text
for df in train, test:
    df["comment_text"] = normalizeString(df["comment_text"])

#stemmer = PorterStemmer()
#def custom_tokenize(text):
#    tokens = wordpunct_tokenize(text)
#    tokens = [stemmer.stem(token) for token in tokens]
#    return tokens

# Tokenize comments
tok = Tokenizer(max_features=MAX_FEATURES, max_len=MAX_LEN, tokenizer=wordpunct_tokenize)
X = tok.fit_transform(pd.concat([train["comment_text"].astype(str).fillna("na"),
                                 test["comment_text"].astype(str).fillna("na")]))
X_train = X[:len(train), :]
X_test = X[len(train):, :]
print(X_train.shape, X_test.shape)
print("<+++++++>")
print("Total words found by tokenizer in train and test are {}".format(len(tok.doc_freq)))
print("Top 10 words in vocab are {}".format(tok.doc_freq.most_common(10)))
print("Last 10 words to be used in vocab with their freq are {}".format(tok.doc_freq.most_common(MAX_FEATURES)[-10:]))

# Initialize embeddings
embedding_matrix, oov_list = initialize_embeddings(EMBEDDING_FILE, tok)
print("<+++++++>")
print("Size of initialized matrix is {}".format(embedding_matrix.shape))
print("No. of words that were not found in the embedding: {}".format(len(oov_list)))
#!/usr/bin/env python
import sys
import json
from Tokenizer import Tokenizer

t = Tokenizer()
f = open('config.json', 'r')
config = json.load(f)
f.close()
catidxmap = config['catidx']
wt = int(config['wt'])
classidx = int(config['classidx'])

def get_weight(event):
    if event == '1':
        return wt
    elif event == '12':
        return 1
    else:
        return 1

for line in sys.stdin:
    try:
        line = line.decode('utf-8', 'ignore').strip()
        toks = line.split('\001')
        query_toks = t.tokenize(toks[0].strip())
        event = toks[1].strip()
        orig_page_toks = t.prepend('ORIG_PAGE', toks[6].strip(), 1)
        first_hit_page_toks = t.prepend('FIRST_HIT', toks[7].strip(), 1)
        facet_value_toks = t.prepend('FACET_VALUE', toks[8].strip(), 5)
        facet_atr_toks = t.prepend('FACET_ATRS', toks[9].strip(), 5)
'''
Created on Sep 21, 2011

@author: calvin
'''
if __name__ == '__main__':
    from StatementParse import StatementParse
    from Tokenizer import Tokenizer
    t = Tokenizer()
    stmt = StatementParse()
    f = open("../testinputs/statement.bl")
    ttext, ttype = t.get_next_non_whitespace(f)
    stmt.parse(f, t, ttext, ttype)
    print "\n\n" + str(stmt)
def __init__(self):
    self.activeKeywords = dict()
    self.tokenizer = Tokenizer()
class Scorer():
    def __init__(self, phrase, index):
        self.tokens = Tokenizer(phrase)
        self.index = index
        self.ranking = {}
        self.lengths = {}
        self.tlength = 0
        self.calc_document_length()
        self.calc_query_length()
        self.calc_ranking()

    def calc_document_length(self):
        for i in self.index.index:
            urls = self.index.index[i].urlList
            for d in urls.iterkeys():
                if d not in self.lengths:
                    self.lengths[d] = 0
                self.lengths[d] += math.pow(self.calc_tf(urls[d]) * self.calc_dtf(len(self.index.bank.urls), i), 2)
        for d in self.lengths:
            self.lengths[d] = math.sqrt(self.lengths[d])

    def calc_query_length(self):
        for t in self.tokens.getTokens():
            self.tlength += math.pow(self.calc_tf(self.get_query_term_length(t)) * self.calc_dtf(len(self.index.bank.urls), t), 2)
        self.tlength = math.sqrt(self.tlength)

    def calc_ranking(self):
        for t in self.tokens.getTokens():
            it = self.index.getIndexToken(t)
            dtf = self.calc_dtf(len(self.index.bank.urls), t)
            for d in it.urlList.iterkeys():
                tf = self.calc_tf(it.urlList[d])
                wtq = tf * dtf
                wtf = self.calc_tf(self.get_query_term_length(t))
                wtd = wtf * dtf
                if d not in self.ranking:
                    self.ranking[d] = 0
                self.ranking[d] += (wtq * wtd)
        for d in self.ranking:
            self.ranking[d] = self.ranking[d] / (self.lengths[d] * self.tlength)

    def calc_tf(self, val):
        return (1 + math.log10(val))

    def calc_dtf(self, n, token):
        return math.log10(float(n) / float(self.index.getDocumentFrequency(token)))

    def get_query_term_length(self, token):
        count = 0
        for t in self.tokens.getTokens():
            if t == token:
                count = count + 1
        return count

    def printScoring(self):
        printable = "["
        for t in self.tokens.getTokens():
            printable += "'%s', " % (t)
        printable = printable[:-2] + "]\n"
        for item in sorted(self.ranking.items(), key=lambda x: x[1], reverse=True):
            printable += "%s:\t%.6f\n" % (item[0], item[1])
        print(printable)

    def printDocumentLength(self):
        printable = ""
        for item in sorted(self.lengths):
            printable += "%s:\t%.6f\n" % (item, self.lengths[item])
        print(printable)
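# Note on the Scorer above: it implements standard tf-idf cosine ranking. In terms of
# its helpers, each term weight is calc_tf(tf) * calc_dtf(N, t), i.e.
#     w(t, d) = (1 + log10(tf of t in d)) * log10(N / df_t)
# calc_ranking() accumulates sum over t of w(t, q) * w(t, d) per document and then divides
# by lengths[d] * tlength (the two Euclidean norms), which is the cosine similarity between
# the query and document weight vectors.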
class Interpreter(object):
    def __init__(self, text):
        self.tokens = Tokenizer(text).tokenize()
        self.expr = []

    def next_expression(self):
        expr = [self.tokens.pop(0)]
        if expr[0] == '{':
            c = self.tokens.pop(0)
            self.tokens.insert(0, '{' + c)
            state['{' + c] = State.brace(int(c))
            return self.next_expression()
        elif type(state[expr[0]]) is Operator:
            for _ in range(state[expr[0]].arity):
                expr += self.next_expression()
        return expr

    def evaluate(self, new_expr=True):
        if new_expr:
            self.expr = self.next_expression()
        if len(self.expr) == 0:
            return None
        if re.fullmatch(r'while', self.expr[0]):
            return self.process_while()
        if re.fullmatch(r'for', self.expr[0]):
            return self.process_for()
        elif re.fullmatch(r'[0-9]+', self.expr[0]):
            return int(self.expr[0])
        elif type(state[self.expr[0]]) is Operator:
            return self.eval_token(self.expr.pop(0))
        return self.expr[0]

    def eval_token(self, token):
        if re.fullmatch(r'[0-9]+', token):
            return int(token)
        elif type(state[token]) is Operator:
            return state[token](*[self.eval_token(self.expr.pop(0)) for _ in range(state[token].arity)])
        return token

    def has_token(self):
        return len(self.tokens) > 0

    def process_while(self):
        a = self.next_expression()
        b = self.next_expression()
        while True:
            self.expr = a[:]
            result = self.evaluate(new_expr=False)
            if not result:
                return None
            self.expr = b[:]
            self.evaluate(new_expr=False)

    def process_for(self):
        preset = self.next_expression()
        boolean = self.next_expression()
        increment = self.next_expression()
        body = self.next_expression()
        self.expr = preset[:]
        self.evaluate(new_expr=False)
        while True:
            self.expr = boolean[:]
            result = self.evaluate(new_expr=False)
            if not result:
                return None
            self.expr = body[:]
            self.evaluate(new_expr=False)
            self.expr = increment[:]
            self.evaluate(new_expr=False)
def segmentRawSentences(self, tokenizer: Tokenizer, strs: str):
    sentence = tokenizer.joinSentences(tokenizer.tokenize(strs))
    return self.segmentTokenizedString(sentence)
def create_index(rootDir):
    MB_100 = 100000000
    docID = 0
    partials = []
    print('Starting...')
    # Set this to the path where you downloaded the developer JSON files
    rootDir = Path(rootDir)
    # partial index
    index = defaultdict(dict)
    partial_num = 0
    curr_size = getsizeof(index)
    # Traverse the directory tree starting at rootDir
    for dirName, subdirList, fileList in os.walk(rootDir):
        # Grab JSON files
        for fname in fileList:
            # Get the path to the JSON file:
            # e.g. C:\Users\bchau\Desktop\Projects\developer.zip\DEV\aiclub_ics_uci_edu
            path = Path(dirName).joinpath(fname)
            # Open the JSON file with above path
            with open(path) as f:
                # Load JSON file
                document = json.load(f)
                stems = Tokenizer.tokenize_and_stem(document['content'])
                # if document is low info, output url to text file and skip indexing
                if len(stems) == 0:
                    print('Low informational value document: ' + document['url'] + '\n')
                else:
                    # stem tokens and get word frequency from the document
                    word_freq = Tokenizer.get_word_freq(stems)
                    # get term frequencies
                    token_tf = Indexer.word_freq_to_tokentf(word_freq)
                    # Convert token_tf list to indices
                    entries = Indexer.tokentf_to_postingtf(docID, token_tf)
                    # add to index and increment size counter appropriately
                    for token, postings in entries.items():
                        for docID, posting in postings.items():
                            if token not in index:
                                curr_size += getsizeof({})
                            index[token][docID] = posting
                            curr_size += getsizeof(docID)
                            curr_size += getsizeof(posting)
                    print('Indexed w/ tf: ', docID)
                    # if index grows larger than 100 MB write it to disk
                    if curr_size > MB_100:
                        # append the name of the file that we wrote to to a list
                        partials.append(Indexer.write_partial_to_disk(index, partial_num))
                        partial_num += 1
                        index.clear()
                        curr_size = getsizeof(index)
                    docID += 1
    # perform merge on partial indices
    Indexer.merge_partials(partials, get_num_docs(rootDir))
    for filename in partials:
        filePath = Path(filename)
        if filePath.exists():
            os.remove(filePath)
    print('Finished.')
        scores = []
        token_set = set(tokens)
        for class_label, prior in self.prior.items():
            cat_score = 0
            cat_score += math.log(prior)
            for token in token_set:
                if token in self.condprobs:
                    cat_score += math.log(self.condprobs[token][class_label])
            scores.append((class_label, cat_score))
        sum_exp = sum(math.exp(s[1]) for s in scores)
        norm_probs = [(class_label, math.exp(s) / sum_exp) for class_label, s in scores]
        return sorted(norm_probs, key=itemgetter(1), reverse=True)[:top_n]

#
# testing code
#
if __name__ == '__main__':
    import json
    import sys
    from Tokenizer import Tokenizer
    t = Tokenizer()
    prior = json.load(open(sys.argv[2], 'r'))
    condprobs = json.load(open(sys.argv[3], 'r'))
    q = sys.argv[1].decode('ascii', 'ignore')
    NB = NaiveBayes(prior, condprobs)
    tokens = t.tokenize(q)
    print tokens
    print NB.classify(tokens)
    print NB.score(tokens, 2)
            if fac is None:
                print("Error, identifier: " + identifier + " is not assigned a value")
                sys.exit()
            self.pt.moveToParent()
        elif alt == 2:
            self.pt.moveToChild(1)
            fac = self.execExp()
            self.pt.moveToParent()
        else:
            print("Error executing Fac, alt not found")
            sys.exit()
        return fac

    # returns the name of an identifier
    def execId(self):
        self.pt.moveToChild(0)
        identifier = self.pt.getName()
        self.pt.moveToParent()
        return identifier

    # executes and returns a number from the parse tree
    def execInt(self):
        self.pt.moveToChild(0)
        num = int(self.pt.getName())
        self.pt.moveToParent()
        return num


i = Interpreter(ParseTree(Tokenizer(str(sys.argv[1]))))
def __init__(self, text):
    self.tokens = Tokenizer(text).tokenize()
    self.expr = []
class Parser:
    def __init__(self, input_content, file=True, debug=False):
        self.tokenizer = Tokenizer(input_content, file)
        self.tokenizer.tokenize()
        self.token = None
        self.is_debug = debug
        self.pos = 0
        self.info = []
        self.debug = []
        self.eps = False

    def take_token(self, token_type_list, eps=False):
        self.eps = False
        self.token = self.next_token()
        if self.is_debug:
            print(self.token)
        if self.token.type not in token_type_list:
            if eps:
                self.token = self.prev_token()
                self.eps = True
                if self.is_debug:
                    print(self.eps)
                return 'EPS'
            else:
                self.raise_error(
                    "Unexpected token: %s at '%s' on line %d column %d; Expected one of %s"
                    % (self.token.type, self.token.value, self.token.line, self.token.column, str(token_type_list)))
        return self.token

    def take_tokens(self, token_type_list):
        for token_type in token_type_list:
            self.take_token(token_type)

    def prev_token(self):
        self.pos -= 1
        return self.tokenizer.tokens[self.pos - 1]

    def next_token(self):
        self.pos += 1
        return self.tokenizer.tokens[self.pos - 1]

    @staticmethod
    def raise_error(msg='Unexpected error occurred'):
        raise RuntimeError('Parser error; %s' % msg)

    def log_info(self, stmt):
        self.info.append(stmt + ' - OK')
        if self.is_debug:
            print(self.info[-1])

    def log_debug(self, stmt):
        self.debug.append(stmt)
        if self.is_debug:
            print(stmt)

    def reset(self):
        self.pos = 0
        self.info = []
        self.debug = []
        self.eps = False

    def parse(self):
        self.reset()
        self.start()
        return

    def start(self):
        # self.next_token()
        self.x_elems()
        return

    # noinspection SpellCheckingInspection
    def x_elems(self):
        self.log_debug('x_elems')
        self.take_token(['ANCHOR_OP', 'VAR_NAME', 'EOF'])
        if self.token.type == 'EOF':
            return
        else:
            self.x_elem()
            self.x_elems()

    def x_elem(self):
        self.log_debug('x_elem')
        if self.token.type == 'ANCHOR_OP':
            self.x_style()
        elif self.token.type == 'VAR_NAME':
            self.x_var_def()
        else:
            self.raise_error()

    def x_style(self):
        self.log_debug('x_style')
        self.x_items()
        self.log_info('style')

    def x_items(self):
        self.log_debug('x_items')
        eps = self.take_token(['VAR_NAME', 'STRING', 'ANCHOR_OP', 'ANCHOR_CLOSE'], eps=True)
        if eps == 'EPS':
            return
        elif self.token.type == 'VAR_NAME':
            self.x_var_def()
        elif self.token.type == 'STRING':
            self.take_token(['COLON'])
            self.x_value()
            self.x_properties()
            self.take_token(['SEMICOLON'])
        elif self.token.type == 'ANCHOR_OP':
            self.x_style()
        elif self.token.type == 'ANCHOR_CLOSE':
            return
        else:
            self.raise_error()
        self.log_info('items')
        self.x_items()

    def x_properties(self):
        self.log_debug('x_properties')
        self.x_value(True)
        if self.eps:
            return
        else:
            self.x_properties()

    def x_var_def(self):
        self.log_debug('x_var_def')
        self.take_token(['COLON'])
        self.x_value()
        self.take_token(['SEMICOLON'])
        self.log_info('var_def')

    def x_value(self, eps_mode=False):
        self.log_debug('x_value')
        eps = self.take_token(['CONSTANT', 'STRING', 'VAR_NAME', 'FUNCTION_OP'], eps_mode)
        if eps_mode and eps == 'EPS':
            pass
        elif self.token.type == 'CONSTANT':
            pass
        elif self.token.type == 'STRING':
            pass
        elif self.token.type == 'FUNCTION_OP':
            self.x_values()
            self.take_token(['BRACKET_CLOSE'])
        elif self.token.type == 'VAR_NAME':
            pass
        else:
            self.raise_error()

    def x_values(self):
        self.log_debug('x_values')
        self.x_value(True)
        eps = self.take_token(['COMMA'], True)
        if eps == 'EPS':
            pass
        else:
            self.x_values()
class DataPrepper(): def __init__(self, PATH_TO_STOP_WORDS, PATH_TO_TRAIN_LIST): self.PATH_TO_STOP_WORDS = PATH_TO_STOP_WORDS self.PATH_TO_CLASS_LIST = PATH_TO_TRAIN_LIST self.Tokenizer = Tokenizer(self.PATH_TO_STOP_WORDS) # Set up class-specific constants self.fpc = self.load_paths_to_training_text( ) # F.P.C means filename_path_classnames self.class_names = self.get_class_names() print("[DataPrepper] Instantiated!") """ Processes the dataset and returns the feature vectors of each of the training and test sets (positively and negatively classified) Note: train_pos_doc_map = datasets[0][0] train_neg_doc_map = datasets[0][1] test_pos_doc_map = datasets[1][0] test_neg_doc_map = datasets[1][1] """ def run(self, class_name, cross_validation_mode=False): print("[DataPrepper] Running for", class_name, ", prepping datasets...") datasets = None if cross_validation_mode: datasets = self.prep_dataset(class_name, 1.0, 1.0) else: datasets = self.prep_dataset(class_name, 0.8, 0.9) print( "Sample sizes - Train: %d positives + %d negatives, Test: %d positives + %d negatives" % (len(datasets[0][0]), len(datasets[0][1]), len( datasets[1][0]), len(datasets[1][1]))) # Text normalization: tokenization, stop word removal & stemming print("[DataPrepper] Tokenizing datasets...") datasets_df_pair = self.tokenize_datasets(datasets) datasets = datasets_df_pair[0] doc_freq_map = datasets_df_pair[1] # Construct df from datasets doc_freq_map = self.cull_doc_freq(doc_freq_map, 50) print("Num of words in vocabs: Vocab=%d" % len(doc_freq_map.keys())) print("Num of words in vocabs: Culled Vocab=%d" % len(doc_freq_map.keys())) N_docs = len(datasets[0][0]) + len(datasets[0][1]) + len( datasets[1][0]) + len(datasets[1][1]) datasets = self.setup_tfidf_vector(N_docs, datasets, doc_freq_map) # === FOR DEBUGGING === # tryA = datasets[0][0][list(datasets[0][0].keys())[0]] # tryB = datasets[0][1][list(datasets[0][1].keys())[0]] # tryC = datasets[1][0][list(datasets[1][0].keys())[0]] # tryD = datasets[1][1][list(datasets[1][1].keys())[0]] # print('---SEE WHAT FEATURE VECTORS LOOK LIKE FOR %s---' % class_name) # print('try A:', tryA, 'dim:', len(tryA)) # print('try B:', tryB, 'dim:', len(tryB)) # print('try C:', tryC, 'dim:', len(tryC)) # print('try D:', tryD, 'dim:', len(tryD)) # print('---END SEE WHAT FEATURE VECTORS LOOK LIKE---') f_vector_pos_train = self.setup_feature_vectors_for_classifier( datasets[0][0]) f_vector_neg_train = self.setup_feature_vectors_for_classifier( datasets[0][1]) f_vector_pos_test = [] f_vector_neg_test = [] if cross_validation_mode: f_vector_pos_test = self.setup_feature_vectors_for_classifier( datasets[1][0]) f_vector_neg_test = self.setup_feature_vectors_for_classifier( datasets[1][1]) return [[f_vector_pos_train, f_vector_neg_train], [f_vector_pos_test, f_vector_neg_test], doc_freq_map] #===========================================================================# # TEXT NORMALIZATION # Functions to facilitate text normalization for all datasets #===========================================================================# def tokenize_datasets_OLD(self, datasets): for i in range(len(datasets)): for j in range(len(datasets[i])): dict_class_documents = datasets[i][j] for doc_name in dict_class_documents.keys(): dict_class_documents[doc_name] = \ self.Tokenizer.tokenize(dict_class_documents[doc_name]) return datasets def tokenize_datasets(self, datasets): doc_freq_map = {} for i in range(len(datasets)): for j in range(len(datasets[i])): dict_class_documents = datasets[i][j] for doc_name in 
dict_class_documents.keys(): dict_class_documents[doc_name] = self.Tokenizer.tokenize( dict_class_documents[doc_name]) # Construct doc freq map on-the-fly tokens_processed_before = [] for token in dict_class_documents[doc_name]: if token not in tokens_processed_before: # unique tokens in a doc tokens_processed_before.append(token) if token not in doc_freq_map.keys( ): # if token is newly found, initialize doc_freq_map[token] = [doc_name] else: doc_freq_map[token].append( doc_name ) # since the word appears in this doc return [datasets, doc_freq_map] #===========================================================================# # TF-IDF VECTORIZATION # Compute TF-IDF vectors for every document #===========================================================================# def setup_tfidf_vector(self, NUM_DOCS, datasets, doc_freq_map): vocab = list(doc_freq_map.keys()) for i in range(len(datasets)): for j in range(len(datasets[i])): dict_class_documents = datasets[i][j] for doc_name in dict_class_documents.keys(): doc = dict_class_documents[doc_name] f_vector = [0] * len(vocab) for token in doc: if token in vocab: tf = doc.count(token) log_tf = (1 + log(tf)) if tf > 0 else 0.0 log_idf = log(NUM_DOCS / len(doc_freq_map[token])) w = log_tf * log_idf f_vector[vocab.index(token)] = w dict_class_documents[doc_name] = f_vector return datasets def cull_doc_freq(self, doc_freq_map, threshold_num_docs): culled_df_map = {} for word in doc_freq_map.keys(): if len(doc_freq_map[word]) > threshold_num_docs: culled_df_map[word] = doc_freq_map[word] return culled_df_map #===========================================================================# # CONSTRUCT VOCABULARY & DOC FREQ MAP # Set up data structures that hold the vocab and doc freq of every word #===========================================================================# def setup_vocab(self, dataset, threshold): count_vocab = {} vocab = [] for doc_name in dataset.keys(): for token in dataset[doc_name]: if token not in count_vocab.keys(): count_vocab[token] = 0 else: count_vocab[token] += 1 if token not in vocab and count_vocab[token] >= threshold: vocab.append(token) return vocab """ Sets up the doc frequency of words in a given dataset. A dataset is a dictionary of this format: { 'doc_name' : ['Here', 'are', ...] } Returns a dictionary containing the document frequency of all words in the chosen dataset in this format: { 'Here' : 12, 'are' : 56 ... } """ def setup_doc_freq(self, dataset): df = {} for doc_name in dataset.keys(): for word in dataset[doc_name]: if word not in df.keys(): df[word] = [doc_name] else: if doc_name not in df[word]: df[word].append(doc_name) return df def get_chisq_vocab(self, data_pos_vocab, data_neg_vocab, docs_pos, docs_neg, threshold): combined_vocabs = self.union_vocabs(data_pos_vocab, data_neg_vocab) N_pos_docs = len(docs_pos.keys()) N_neg_docs = len(docs_neg.keys()) feature_selected_vocab = [] for word in (combined_vocabs): N_pos_docs_containing_word = self.get_num_contains_word( docs_pos, word) N_pos_docs_not_containing_word = N_pos_docs - N_pos_docs_containing_word N_neg_docs_containing_word = self.get_num_contains_word( docs_neg, word) N_neg_docs_not_containing_word = N_neg_docs - N_neg_docs_containing_word # no. 
of training docs that: N_00 = N_neg_docs_not_containing_word # in negative class, do not contain w N_01 = N_pos_docs_not_containing_word # in positive class, do not contain w N_10 = N_neg_docs_containing_word # in negative class, contain w N_11 = N_pos_docs_containing_word # in positive class, contain w chisq = 0 if not (N_00 == 0 and N_01 == 0): chisq = ((N_11 + N_10 + N_01 + N_00) * pow(N_11 * N_00 - N_10 * N_01, 2)) / \ ((N_11 + N_01) * (N_11 + N_10) * (N_10 + N_00) * (N_01 + N_00)) if chisq > threshold: feature_selected_vocab.append(word) return feature_selected_vocab def get_num_contains_word(self, df, word): docs_containing_word = [] for doc_name in df.keys(): if word in df[doc_name]: docs_containing_word.append(doc_name) return len(docs_containing_word) def union_vocabs(self, vocab_1, vocab_2): unioned_vocab = [] for word in vocab_1: if word not in unioned_vocab: unioned_vocab.append(word) for word in vocab_2: if word not in unioned_vocab: unioned_vocab.append(word) return unioned_vocab #===========================================================================# # CONSTRUCT FEATURE VECTORS FOR EACH CLASS # Compute feature vectors representing each class' text document #===========================================================================# def setup_feature_vectors(self, vocab, dataset): fea_datasets = [] dataset_f_vectors = [] for doc_name in dataset.keys(): doc = dataset[doc_name] DOC_N = len(doc) f_vector = [0] * len(vocab) # Count word occurrence with reference to vocab for word in doc: if word in vocab: f_vector[vocab.index(word)] += 1 # Normalize by the number of words in a document for k in range(len(f_vector)): f_vector[k] = f_vector[k] / DOC_N # Finished processing a feature vector of a doc dataset_f_vectors.append(f_vector) return dataset_f_vectors """ Stack map of {'doc_name': [1.81, 0, 6.8...] ... } into a list of feature vectors """ def setup_feature_vectors_for_classifier(self, doc_tfidf_vector_map): f_vectors = [] for doc_name in doc_tfidf_vector_map.keys(): f_vectors.append(doc_tfidf_vector_map[doc_name]) return f_vectors #===========================================================================# # CONSTRUCT THE DATASET # Retrieves texts from training and test files #===========================================================================# """ Prepares the datasets we will need for training and testing. Splits our corpus into positive and negative train/test sets. Returns a list of 2 pairs of tuples - one for train & test set, where each tuple contains 2 dictionaries - one for positives & negatives """ def prep_dataset(self, positive_class_name, pos_frac, neg_frac_per_class): positives_fpc = self.get_texts_for_class(positive_class_name) N_pos_docs = len(positives_fpc) negatives_fpc_map = {} N_neg_docs = 0 # Set up a dictionary containing { 'neg_class_name': [['53886', 'path_to_doc', 'c2'], [...] ...] 
} for class_name in self.class_names: if not (class_name == positive_class_name): negatives_fpc_map[class_name] = self.get_texts_for_class( class_name) N_neg_docs += 1 # Split the positive classes into train and test sets N_pos_train = int(N_pos_docs * pos_frac) N_pos_test = int(N_pos_docs * (1 - pos_frac)) positives = self.sample_N_pos_texts(positives_fpc, N_pos_train) train_positives = positives[0] test_positives = positives[1] # Sample and split the negatives classes into train and test sets negatives = self.sample_N_neg_texts(negatives_fpc_map, neg_frac_per_class) train_negatives = negatives[0] test_negatives = negatives[1] return [[train_positives, train_negatives], [test_positives, test_negatives]] """ Reads the train-class-list or test-class-list file to retrieve all the paths to each document Returns a list of 3-tuples in the format: [[doc_name, path_to_doc, class_name], ...] """ def load_paths_to_training_text(self): filepath_class_file = open(self.PATH_TO_CLASS_LIST, 'r') filepath_class_lines = filepath_class_file.readlines() filename_path_classnames = [] for ln in filepath_class_lines: filepath_class_pair = self.Tokenizer.split_on_whitespace_from_back( ln) filename = self.Tokenizer.split_on_slash_from_back( filepath_class_pair[0])[1] filepath_class_pair[1] = self.Tokenizer.strip_newline( filepath_class_pair[1]) result = [] result.append(filename) result.append(filepath_class_pair[0]) result.append(filepath_class_pair[1]) filename_path_classnames.append(result) return filename_path_classnames """ Gets the list of all the class names in our corpus Returns a list of [String] class names """ def get_class_names(self): result = [] for filename_path_classname in self.fpc: candidate_class_name = filename_path_classname[2] if candidate_class_name not in result: result.append(candidate_class_name) return result """ Gets a list of filenames classified as `class_name` Returns a list of up to LIMIT (optional) 3-tuples in the format: [[doc_name, path_to_doc, class_name], ...] for the specified class_name """ def get_texts_for_class(self, class_name, LIMIT=None): result = [] for filename_path_classname in self.fpc: if filename_path_classname[2] == class_name: if LIMIT != None and len(result) > LIMIT: break else: result.append(filename_path_classname) return result """ Retrieves the first N texts from a positive class Returns a tuple of a 1.) dictionary of N positive training entries, 2.) dictionary of N positive testing entries the format: [ { '[doc_name]' : 'some long string of text...' ... }, { '[doc_name]' : 'some long string of text...' ... } ] """ def sample_N_pos_texts(self, pos_fpc, N): result_train = {} result_test = {} count = 0 # Obtain the documents from each class specified in class_names # First N documents are sent for training, the remaining are sent for testing for fpc in pos_fpc: doc_name = fpc[0] path_to_doc = fpc[1] class_name = fpc[2] f = open(path_to_doc, 'r', encoding='latin1') if count < N: result_train[doc_name] = f.read() count += 1 else: result_test[doc_name] = f.read() return (result_train, result_test) """ Retrieves the first N / len(negative_classes) texts from each of the specified list of negative classes Returns a tuple of a 1.) dictionary of N negative training entries, 2.) dictionary of N negative testing entries the format: [ { '[doc_name]' : 'some long string of text...' ... }, { '[doc_name]' : 'some long string of text...' ... 
} ] """ def sample_N_neg_texts(self, negatives_fpc_map, neg_frac_per_class): negative_classes = negatives_fpc_map.keys() neg_train_map = {} neg_test_map = {} for class_name in negative_classes: N_docs = len(negatives_fpc_map[class_name]) N_train = int(N_docs * neg_frac_per_class) for i in range(N_docs): # Retrieve elements in fpc 3-tuple doc_tuple = negatives_fpc_map[class_name][i] doc_name = doc_tuple[0] path_to_doc = doc_tuple[1] class_name = doc_tuple[2] f = open(path_to_doc, 'r', encoding='latin1') if i < N_train: neg_train_map[doc_name] = f.read() else: neg_test_map[doc_name] = f.read() return (neg_train_map, neg_test_map)
#-------------------------------
import torch
from torch.utils.data import DataLoader
from MyDataSet import MyDataSet
from Tokenizer import Tokenizer
from pad import pad
from Config import Config
from Seq2Seq import Seq2Seq

if __name__ == '__main__':
    source_path = '../data/test/source.txt'
    target_path = '../data/test/target.txt'
    vocab_path = '../data/vocab.txt'
    model_path = '../model/model.pth'
    tokenizer = Tokenizer(vocab_path)
    config = Config()
    fr = open('../result/test.txt', 'w', encoding='utf-8-sig')  # stores the prediction results
    loader = DataLoader(dataset=MyDataSet(source_path, target_path, tokenizer),
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=2,
                        collate_fn=pad,
                        drop_last=False)  # keep the last (possibly smaller) batch
    device = torch.device('cpu')
    model = Seq2Seq(config)
    model.to(device)

    # load the trained model
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model'])
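# The loader above depends on a `pad` collate function (from pad.py) that is
# not shown here. A minimal sketch of what such a collate function typically
# looks like for a seq2seq batch, assuming each dataset item is a
# (source_ids, target_ids) pair of token-id lists; the return layout and the
# pad id are assumptions, not the project's actual code.
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_batch(batch, pad_id=0):
    sources = [torch.tensor(src, dtype=torch.long) for src, _ in batch]
    targets = [torch.tensor(tgt, dtype=torch.long) for _, tgt in batch]
    # Pad every sequence to the length of the longest one in the batch.
    src_batch = pad_sequence(sources, batch_first=True, padding_value=pad_id)
    tgt_batch = pad_sequence(targets, batch_first=True, padding_value=pad_id)
    # Keep the true lengths so padded positions can be masked out later.
    src_lens = torch.tensor([len(s) for s in sources])
    tgt_lens = torch.tensor([len(t) for t in targets])
    return src_batch, src_lens, tgt_batch, tgt_lens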
            object = utils.getObject(wordtags, size, i)
            firedNode = self.findFiredNode(object)
            if firedNode.depth > 0:
                if firedNode.conclusion == "B":
                    sb = sb + " " + wordtags[i].form
                else:
                    sb = sb + "_" + wordtags[i].form
            else:
                if wordtags[i].tag == "B":
                    sb = sb + " " + wordtags[i].form
                else:
                    sb = sb + "_" + wordtags[i].form
        return sb.strip()

    # def segmentRawString(self, strs: str) -> str:
    #     return self.segmentTokenizedString(" ".join(Tokenizer.tokenize(strs)))

    def segmentRawSentences(self, tokenizer: Tokenizer, strs: str):
        sentence = tokenizer.joinSentences(tokenizer.tokenize(strs))
        return self.segmentTokenizedString(sentence)


if __name__ == "__main__":
    rdrsegment = RDRSegmenter()
    tokenizer = Tokenizer()
    t = time.time()
    output = rdrsegment.segmentRawSentences(
        tokenizer,
        "hôm nay tôi đau bụng cảm sốt nhức đầu ho khan tại Hà Nội có triệu chứng bị Covid 19"
    )
    print(output, time.time() - t)
'''
Created on Nov 2, 2011

@author: Calvin
'''
if __name__ == '__main__':
    from ProgramParse import ProgramParse
    from Tokenizer import Tokenizer

    t = Tokenizer()
    prog = ProgramParse()
    f = open("../testinputs/program.bl")
    prog.parse(f, t)
    print "\n\n" + str(prog)
def main(argv): # construct our pipeline list reading from command line args # still need to figure out best way to pass parameters on command # line global verbose global norm split = None transforms = [] for arg in argv[0].split(","): if arg == "toke": transforms.append(Tokenizer()) elif arg == "stem": transforms.append(Stemmer()) elif arg == "stem-porter": transforms.append(Stemmer(mode='Porter')) elif arg == "stem-lancaster": transforms.append(Stemmer(mode='Lancaster')) elif arg == "stem-lemmatize": transforms.append(Stemmer(mode='Lemmatize')) elif arg == "vect": transforms.append(Vectorizer()) elif arg == "vect-tfidf": transforms.append(Vectorizer(mode='TFIDF')) elif arg == "vect-count": transforms.append(Vectorizer(mode='Count')) elif arg == "vect-lda-2": transforms.append(Vectorizer(mode='LDA', ldaSplits=2)) elif arg == "vect-lda-10": transforms.append(Vectorizer(mode='LDA', ldaSplits=10)) elif arg == "vect-lda-25": transforms.append(Vectorizer(mode='LDA', ldaSplits=25)) elif arg == "vect-lda-50": transforms.append(Vectorizer(mode='LDA', ldaSplits=50)) elif arg == "vect-lda-150": transforms.append(Vectorizer(mode='LDA', ldaSplits=150)) elif arg == "vect-lda-500": transforms.append(Vectorizer(mode='LDA', ldaSplits=500)) elif arg == "svm": transforms.append(Model('svm')) elif arg == "nb": transforms.append(Model('nb')) elif arg == "lr": transforms.append(Model('lr')) elif arg == "nn": transforms.append( Model('nn', inputDim=10000) ) #Configured for Vectorizer with vectors limited to 1000 elif arg == "norm": norm = True elif arg == "no-verb": verbose = False elif arg == "split-sentences": split = "sentences" elif arg == "nn-optim": # Memory optimized neural network. transforms.append( OptimNN(vecMode='TFIDF', epochs=2, batchSize=2048)) else: raise Exception(f"Invalid transformer {arg}") pipe = Pipeline(transforms, norm=norm) # read our data (hardcoded for now) df0 = pd.read_pickle( "./data/democrat_comments.pkl") #.sample(frac = 0.05) # DEBUG ONLY df1 = pd.read_pickle( "./data/republican_comments.pkl") #.sample(frac = 0.05) # DEBUG ONLY if (split is not None): if (verbose): print('Splitting Democrat comments') df0 = splitRows(df0, mode=split, verbose=verbose) if (verbose): print('Splitting Republican comments') df1 = splitRows(df1, mode=split, verbose=verbose) label0 = df0.subreddit.iloc[0] label1 = df1.subreddit.iloc[0] # concatenate and clean our data X = pd.concat([df0.body, df1.body], ignore_index=True) y = pd.concat([df0.subreddit, df1.subreddit], ignore_index=True).replace(to_replace=[label0, label1], value=[0, 1]) # split into training and test from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) if (verbose): print('Applying Transforms and Training Model') print('Train Data:', train_path) print('Test Data:', test_path) print('Transforms:', argv[0]) # fit our data pipe.fit_transform(X_train, y_train) # do the prediction y_pred = pipe.predict(X_test) results = pipe.validate(y_pred, y_test, True, True) # get most suprising misclassifications for class 0 print("Most suprising texts misclassified as class 0") idx_list = heapq.nlargest(5, results[2][0], key=lambda x: x[1]) for i, (idx, prob) in enumerate(idx_list): print(f"{i}) probability class 1 = {prob}\n{X_test[idx]}, \n") # get most suprising misclassifications for class 1 print("Most suprising texts misclassified as class 1") idx_list = heapq.nlargest(5, results[2][1], key=lambda x: x[1]) for i, (idx, prob) in enumerate(idx_list): 
print(f"{i}) probability class 0 = {prob}\n{X_test[idx]}\n")
def build_data(input_file):
    tokenizer = Tokenizer(input_file)
    rows = tokenizer.next_int()
    cols = tokenizer.next_int()
    num_cars = tokenizer.next_int()
    num_rides = tokenizer.next_int()
    bonus = tokenizer.next_int()
    total_steps = tokenizer.next_int()
    tokenizer.next_line()

    # build all rides
    rides = []
    for i in range(num_rides):
        a = tokenizer.next_int()
        b = tokenizer.next_int()
        x = tokenizer.next_int()
        y = tokenizer.next_int()
        s = tokenizer.next_int()
        f = tokenizer.next_int()
        tokenizer.next_line()
        rides.append(Ride(a, b, x, y, s, f, i))

    cars = []
    for i in range(num_cars):
        cars.append(Car())

    return rides, cars, bonus, num_rides, num_cars
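# The Tokenizer consumed by build_data is assumed to expose next_int() and
# next_line() over a whitespace-separated input file. A minimal, hypothetical
# stand-in that documents the interface build_data relies on; the real class
# may be implemented differently.
class SimpleTokenizer:
    def __init__(self, path):
        with open(path, 'r') as f:
            self.lines = [line.split() for line in f]
        self.row = 0
        self.col = 0

    def next_int(self):
        # Return the next whitespace-separated token on the current line as an int.
        value = int(self.lines[self.row][self.col])
        self.col += 1
        return value

    def next_line(self):
        # Move the cursor to the start of the next line.
        self.row += 1
        self.col = 0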
sources = glob.glob("./{}/*.jack".format(in_file)) if os.path.isdir(in_file) else [in_file]

for source in sources:
    base_name = source[:-len(".jack")]
    in_file = source
    tokenizer_outfile = "{}T.xml".format(base_name)
    compilation_engine_outfile = "{}.xml".format(base_name)

    with open(tokenizer_outfile, 'w') as tokenizer_file_out:
        tokenizer_xml_writer = XMLWriter(tokenizer_file_out)
        tokenizer_xml_writer.open_tag('tokens')
        with open(in_file, 'rb') as f_in:
            tokenizer = Tokenizer(f_in)
            while True:
                try:
                    tokenizer_xml_writer.write_token(tokenizer.advance())
                except TokenizerReachedEndOfFileException:
                    print('Reached end')
                    break
        tokenizer_xml_writer.close_tag('tokens')

    with open(compilation_engine_outfile, 'w') as ce_file_out:
        ce_xml_writer = XMLWriter(ce_file_out)
        with open(in_file, 'rb') as f_in:
            tokenizer = Tokenizer(f_in)
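# XMLWriter is not shown; from its use above it only needs open_tag,
# close_tag, and write_token. A hypothetical sketch under the assumption that
# each token object exposes a type string and a value string (those attribute
# names are guesses, not the project's actual API).
class SimpleXMLWriter:
    def __init__(self, out):
        self.out = out

    def open_tag(self, name):
        self.out.write('<{}>\n'.format(name))

    def close_tag(self, name):
        self.out.write('</{}>\n'.format(name))

    def write_token(self, token):
        # e.g. <keyword> class </keyword>
        self.out.write('<{0}> {1} </{0}>\n'.format(token.type, token.value))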
from sequence_mask_loss import sequence_mask_loss
from Config import Config
from Seq2Seq import Seq2Seq
from rouge import Rouge
from convert_to_RougePattern import convert_to_RougePattern
import random

if __name__ == '__main__':
    source_path = '../data/train/source.txt'
    target_path = '../data/train/target.txt'
    eval_source_path = '../data/eval/source.txt'
    eval_target_path = '../data/eval/target.txt'
    vocab_path = '../data/vocab.txt'
    log_path = '../log/log.txt'
    log = open(log_path, 'w', encoding='utf-8')
    tokenizer = Tokenizer(vocab_path)
    config = Config()
    rouge = Rouge()  # evaluation metric

    # training set
    loader = DataLoader(dataset=MyDataSet(source_path, target_path, tokenizer),
                        batch_size=config.batch_size,
                        shuffle=True,
                        num_workers=0,
                        collate_fn=pad,
                        drop_last=False)  # keep the last (possibly smaller) batch

    # evaluation set
    eval_loader = DataLoader(dataset=MyDataSet(eval_source_path, eval_target_path, tokenizer),
                             batch_size=config.batch_size,
                             shuffle=True,
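# sequence_mask_loss is imported above but not shown; in seq2seq training it
# usually denotes cross-entropy that ignores padded target positions. A
# minimal sketch under that assumption (the argument layout is hypothetical,
# not the project's actual signature).
import torch
import torch.nn.functional as F

def masked_sequence_loss(logits, targets, lengths):
    """Cross-entropy over a padded batch, averaged over real (non-pad) positions.

    logits:  (batch, seq_len, vocab_size)
    targets: (batch, seq_len) padded token ids
    lengths: (batch,) true target lengths
    """
    batch, seq_len, vocab = logits.shape
    # mask[b, t] is True for positions before the true sequence length.
    mask = torch.arange(seq_len, device=targets.device)[None, :] < lengths[:, None]
    loss = F.cross_entropy(logits.reshape(-1, vocab), targets.reshape(-1), reduction='none')
    loss = loss.reshape(batch, seq_len) * mask
    return loss.sum() / mask.sum()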
        return self.exp.EvalExp()


# Reading the file and creating a list of words
programfile = sys.argv[1]
filename = sys.argv[2]
infile = open(filename, "r")
tail = []
for line in infile:
    for string in line.split():
        tail.append(string)
infile.close()

# Opening output file for writing
nameAndFormat = filename.split(".txt")
outfilename = "testoutput.txt".join(nameAndFormat)
outfile = open(outfilename, "w")
prettyprint = open(programfile, "w")

# Creating a tokenizer
t = Tokenizer(tail)
Program = Prog()
Program.ParseProg()
Program.PrintProg(prettyprint)
Program.ExecProg()
prettyprint.close()
outfile.close()
def analyzeFile(self, filename):
    tokenizer = Tokenizer(filename)
    tokenizer.tokenize()
    compiler = CompilationEngine(filename)
    compiler.compile()
    return
def __init__(self):
    print("[Tester] instantiated!")
    self.Tokenizer = Tokenizer(PATH_TO_STOP_WORDS)