def hmm(file):
    """Given an open FILE, e.g. from the open(filename) function,
    read pre-tagged sentences of WSJ, one per line.  Return an HMM,
    here represented as a tuple containing (1) the transition
    probabilities, and (2) the emission probabilities."""
    transitions = DefaultDict(DefaultDict(0))
    emissions = DefaultDict(DefaultDict(0))
    wordcounts = DefaultDict(0)
    # For each sentence (one per line)
    for line in file:
        # For each word in the sentence (space separated)
        prevtag = 'START'        # Before each sentence, begin in the START state
        for taggedword in line.split():
            (word, tag) = taggedword.split('/')
            transitions[prevtag][tag] += 1
            emissions[tag][word] += 1
            wordcounts[word] += 1
            prevtag = tag        # The current tag becomes the context for the next word
    # At test time we will need estimates for "unknown words"---the words
    # that never occurred in the training data.  One recommended way to do
    # this is to turn all training words occurring just once into '<UNKNOWN>'
    # and use this as the stand-in for all "unknown words" at test time.
    # Below we make all the necessary transformations to '<UNKNOWN>'.
    for tag, worddict in emissions.items():
        for word, count in worddict.items():
            if wordcounts[word] == 1:
                del emissions[tag][word]
                emissions[tag]['<UNKNOWN>'] += 1
    # Here you need to add code that will turn these dictionaries
    # of counts into dictionaries of smoothed conditional probabilities
    return (transitions, emissions)
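# The skeleton above stops at raw counts.  The sketch below shows one way the
# missing final step could look: normalizing each row of counts into a
# conditional distribution with add-one (Laplace) smoothing.  The helper name
# counts_to_probs and the choice of smoothing scheme are assumptions, not part
# of the original assignment code.
def counts_to_probs(table):
    """Given a dict-of-dicts of counts (e.g. transitions or emissions),
    return a dict-of-dicts of add-one smoothed conditional probabilities."""
    probs = {}
    for context, counts in table.items():
        vocab = len(counts)                    # number of distinct outcomes seen in this context
        total = float(sum(counts.values()))    # total count for this context
        probs[context] = dict((outcome, (count + 1) / (total + vocab))
                              for (outcome, count) in counts.items())
    return probs

# Typical use inside hmm(), just before the return statement (an assumption):
#   return (counts_to_probs(transitions), counts_to_probs(emissions))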
def bigrams(words):
    """Given an array of words, returns a dictionary of dictionaries,
    containing occurrence counts of bigrams."""
    d = DefaultDict(DefaultDict(0))
    for (w1, w2) in zip([None] + words, words + [None]):
        d[w1][w2] += 1
    return d
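# A quick illustration of what bigrams() returns, assuming DefaultDict comes
# from the accompanying dicts module and behaves like a dict with a default
# value.  None marks the sentence boundary on both ends.
#
#   counts = bigrams(['the', 'cat', 'sat', 'the', 'cat'])
#   counts[None]['the']   -> 1   (sentence-initial 'the')
#   counts['the']['cat']  -> 2
#   counts['cat'][None]   -> 1   (sentence-final 'cat')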
def parse(sentence):
    global grammar
    # Create the table; index j for rows, i for columns
    length = len(sentence)
    score = [None] * (length)
    prob_table = DefaultDict(float)
    trace = {}
    list2 = []
    for j in range(length):
        score[j] = [None] * (length+1)
        for i in range(length+1):
            score[j][i] = []
    # Fill the diagonal of the table with the parts-of-speech of the words
    for k in range(1, length+1):
        results = producers(sentence[k-1])
        for item in results:
            try:
                prob = grammar[item][sentence[k-1],]
            except KeyError:
                prob = grammar[item]['<unk>',]
            prob_table[k-1, k, item] = prob
        score[k-1][k].extend(results)
    # Weighted CYK: combine adjacent spans, keeping the best score and a
    # backpointer for each (start, end, nonterminal) cell
    for width in range(2, length+1):
        for start in range(0, length+1-width):
            end = start + width
            for mid in range(start, end):
                args = None
                for x in score[start][mid]:
                    for y in score[mid][end]:
                        results = producers((x, y))
                        for item in results:
                            prob1 = grammar[item][(x, y)]
                            prob2 = prob1 + prob_table[start, mid, x] + prob_table[mid, end, y]
                            check = start, end, item
                            if check in prob_table:
                                if prob2 > prob_table[start, end, item]:
                                    prob_table[start, end, item] = prob2
                            else:
                                prob_table[start, end, item] = prob2
                                args2 = x, y, mid
                            if check in trace:
                                if prob2 >= prob_table[start, end, item]:
                                    args = x, y, mid
                                    trace[start, end, item] = args
                            else:
                                args = x, y, mid
                                trace[start, end, item] = args
                            if item not in score[start][end]:
                                score[start][end].append(item)
    # Reconstruct the best parse if TOP spans the whole sentence
    try:
        if prob_table[0, length, 'TOP']:
            return get_tree(sentence, trace, 0, length, 'TOP')
    except KeyError:
        print "",
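# parse() above depends on two globals defined elsewhere: `grammar` and
# `producers`.  The sketch below records the interface the code appears to
# assume; it is an illustration, not the original helpers.  Scores are
# combined with +, so the grammar presumably stores log-probabilities.
#
#   grammar[lhs][rhs] -> log P(lhs -> rhs), where rhs is a 1-tuple holding a
#   terminal, e.g. grammar['NN']['dog',], or a pair of nonterminals, e.g.
#   grammar['NP'][('DT', 'NN')].
def producers(rhs):
    """Hypothetical helper: return every nonterminal lhs with a rule
    lhs -> rhs in the grammar, looking up a bare word as a 1-tuple to
    mirror the grammar[item][word,] lookups in parse()."""
    key = rhs if isinstance(rhs, tuple) else (rhs,)
    return [lhs for lhs in grammar if key in grammar[lhs]]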
def files2countdict(files):
    """Given an array of filenames, return a dictionary with keys being
    the space-separated, lower-cased words, and the values being the
    number of times that word occurred in the files."""
    d = DefaultDict(0)
    for file in files:
        for word in open(file).read().split():
            d[word.lower()] += 1
    return d
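# Example use of files2countdict, assuming DefaultDict comes from the
# accompanying dicts module (the filenames below are placeholders):
#
#   counts = files2countdict(['ham/msg1.txt', 'ham/msg2.txt'])
#   counts['the']        -> number of times 'the' appears across both files
#   counts['never-seen'] -> 0 (the DefaultDict default)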
def train_maxent(dirs):
    """Train and return a MaxEnt classifier.
    The data structure returned is a dictionary whose keys are
    ('classname','word') tuples.  The values in the dictionary are
    the parameters (lambda weights) of the classifier.
    Note that this method does not return the list of classnames,
    but the caller has those available already, since it is exactly
    the 'dirs' argument.  If you need to recover the classnames from
    the dictionary itself, you'd need to do something like:
      maxent = train_maxent(dirs)
      classes = list(set([c for (c,v) in maxent.keys()]))
    Some typical usage:
      dirs = ['spam','ham']  # where these are sub-directories of the CWD
      maxent = train_maxent(dirs)
      # interested in seeing the weight of "nigerian" in the "spam" class?
      lambda_spam_nigerian = maxent[('spam','nigerian')]
      # to classify a document
      scores = classify(maxent,dirs,"spam/file123")
    """
    classes = dirs
    maxent = DefaultDict(0)
    # Gather the "constraints" and initialize the all-zero maxent dictionary
    constraints = DefaultDict(0)
    for cls in classes:
        maxent[(cls,'DEFAULT')] = 0
        print cls
        for file in glob.glob(cls+"/*"):
            for word in open(file).read().split():
                word = word.lower()
                constraints[(cls,word)] += 1
                for clss in classes:
                    maxent[(clss,word)] = 0
    # Remember the maxent features, and get the starting point for optimization
    features = maxent.keys()
    lambda0 = maxent.values()
    # Here call an optimizer to find the best lambdas
    lambdaopt = optimize.fminNCG(value, lambda0, gradient,
                                 args=(features,dirs), printmessg=1)
    # Put the final optimal parameters in the returned dictionary
    assert maxent.keys() == features   # Make sure the keys have not changed order
    maxent2 = dict([(k,v) for (k,v) in zip(maxent.keys(),lambdaopt)])
    return maxent2
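# The docstring above refers to a classify() helper defined elsewhere in the
# assignment.  The sketch below is one plausible version, written only to show
# how the (class, word) lambda weights would be used at test time; the softmax
# normalization and the handling of the 'DEFAULT' feature are assumptions.
import math

def classify(maxent, classes, filename):
    """Return {classname: P(class | document)} for the document in filename,
    scoring each class by the sum of its lambda weights over the document's
    words and normalizing with a softmax (hypothetical implementation)."""
    words = [w.lower() for w in open(filename).read().split()]
    totals = {}
    for cls in classes:
        total = maxent.get((cls, 'DEFAULT'), 0)
        for word in words:
            total += maxent.get((cls, word), 0)
        totals[cls] = total
    # Softmax over the per-class scores (subtract the max for stability)
    biggest = max(totals.values())
    exps = dict((cls, math.exp(t - biggest)) for (cls, t) in totals.items())
    z = sum(exps.values())
    return dict((cls, e / z) for (cls, e) in exps.items())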
def __init__(self, positive, negative, neutral, pos,
             start=0, finish=None, weight=0.2):
    self.positive = positive
    self.negative = negative
    self.neutral = neutral
    self.pos = pos
    self.weight = weight
    self.s = {}
    self.s0 = {}
    self.initialize_s()
    self.lemmas = sorted(self.s.keys())
    self.lemma_count = len(self.lemmas)
    self.start = start
    self.finish = finish
    if self.finish is None or self.finish > self.lemma_count:
        self.finish = self.lemma_count
    self.a = DefaultDict(DefaultDict(0.0))
    self.initialize_a()
def search(self, net, origin, destination, accept_single_edge=False):
    """Performs a search from origin to destination."""
    # Initialize necessary structures/data
    self.__priority_queue = PriorityDict()
    self.__edges = DefaultDict(lambda id: EdgeData(net.getEdge(id)))
    self.__destination = self.__edges[destination.getID()]
    first = self.__edges[origin.getID()]
    first.previous_edge = None
    first.heuristic_cost = self.heuristic_cost(origin)
    if origin.getID() == destination.getID():
        # If the origin and destination are the same, they must still be
        # changed from the original (in this case meaningless) zero-cost state
        first.state = EdgeData.UNVISITED
    else:
        first.state = EdgeData.OPEN
    if accept_single_edge:
        # Insert the first edge into the queue
        self.__priority_queue[first.getID()] = first.estimated_cost
    else:
        # Insert the neighbors of the first edge into the queue
        self.__visit_neighbors_of(first)
    # Main search body
    found_result = self.__search()
    # Reconstruct the result, if found
    if found_result:
        return self.__destination.reconstruct_path()
    else:
        return None
import sys
import time
import threading
import Pyro4
import commands
from dicts import DefaultDict
from lock import Lock
import operator

Pyro4.config.SERIALIZERS_ACCEPTED.add('pickle')   # pickle serializer for data transmission
Pyro4.config.SERVERTYPE = "thread"                # prespawned pool of thread server
Pyro4.config.THREADPOOL_SIZE = 10                 # number of threads spawned

clients = DefaultDict(DefaultDict(0))        # clients[clientID][event] -- registered clients
cache_scores = DefaultDict(DefaultDict(-1))  # cache_scores[team][event] -- cached scores for a given event
cache_medals = DefaultDict(DefaultDict(-1))  # cache_medals[team][medalType] -- cached medal count for a team
rwl = Lock()
global database         # database server object
idNum = 1               # id of the server
clientRegistry = {}     # store the registered clients
global cacophonix
push = 0                # decide whether to use pull or push based cache
pull = 0
def bigrams(words):
    d = DefaultDict(DefaultDict(0))
    for (w1, w2) in zip([None] + words, words + [None]):
        d[w1][w2] += 1
    return d
#!/usr/bin/python3
# Jeannelle Alford
# jkalfor2
# A language ID program using a letter bigram model

from dicts import DefaultDict
import re
import math

file = open("LangId.train.English", "r")
data = file.read()
file.close()
enDictUni = DefaultDict(0)
enDictBi = DefaultDict(DefaultDict(1))
enDictUni[data[0]] += 1
for i in range(1, len(data)):
    enDictUni[data[i]] += 1
    enDictBi[data[i-1]][data[i]] += 1

file = open("LangId.train.French", "r")
data = file.read()
file.close()
frDictUni = DefaultDict(0)
frDictBi = DefaultDict(DefaultDict(1))
frDictUni[data[0]] += 1
for i in range(1, len(data)):
    frDictUni[data[i]] += 1
    frDictBi[data[i-1]][data[i]] += 1

file = open("LangId.train.Italian", "r")
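# The sketch below (not part of the original file) shows how these letter
# unigram/bigram counts could be turned into a per-language score for a test
# line.  It assumes the usual bigram estimate P(c2 | c1) = count(c1 c2) /
# count(c1); the +1 default on the bigram dictionaries above already gives
# unseen bigrams a small count, and the +1 in the denominator here is a
# matching assumption to avoid division by zero for unseen contexts.
def score_line(line, dictUni, dictBi):
    """Return the log probability of the letters in line under a bigram
    model built from dictUni/dictBi (hypothetical helper)."""
    logprob = 0.0
    for i in range(1, len(line)):
        prev, cur = line[i-1], line[i]
        logprob += math.log(dictBi[prev][cur] / (dictUni[prev] + 1))
    return logprob

# The best guess for a line would then be the language whose model scores it
# highest, e.g.:
#   max([('English', score_line(line, enDictUni, enDictBi)),
#        ('French',  score_line(line, frDictUni, frDictBi))],
#       key=lambda pair: pair[1])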
# Generates the db.p database file.  This is the pickle file.
import threading
import Pyro4
import commands
from dicts import DefaultDict
from lock import Lock
import time
import pickle

Pyro4.config.SERIALIZERS_ACCEPTED.add('pickle')   # pickle serializer for data transmission
Pyro4.config.SERVERTYPE = "thread"                # prespawned pool of thread server
Pyro4.config.THREADPOOL_SIZE = 100                # number of threads spawned

scores = DefaultDict(DefaultDict(0))   # scores[team][event] -- scores for a given event
medals = DefaultDict(DefaultDict(0))   # medals[team][medalType] -- medal count for a team
medalTime = DefaultDict(0)             # time stamp corresponding to the medal tally
rwl = Lock()
idNum = 0
databaseFile = None

class dataUpdate(object):
    def __init__(self):
        self.events = ['skating', 'curling', 'snowboard']   # events list
        self.teams = ['Gauls', 'Romans']                     # teams list