def all_combsi(lol):
    # Lazily yield every combination that takes one element from each inner
    # list (the Cartesian product), decoding the running index i as a
    # mixed-radix number whose digits index into the inner lists.
    lens = [len(x) for x in lol]
    num_combs = reduce(lambda x, y: x * y, lens, 1)
    for i in xrange(num_combs):
        tmp = [0] * len(lol)
        for j in xrange(len(tmp)):
            tmp[j] = lol[j][i % lens[j]]
            i //= lens[j]
        yield tmp
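# A minimal usage sketch for all_combsi above; the input lists are illustrative,
# not from the original source. Each yielded list picks one element from each
# inner list, so inner lists of sizes 2 and 3 give 2 * 3 = 6 combinations.
for combo in all_combsi([[1, 2], ['a', 'b', 'c']]):
    print(combo)   # [1, 'a'], [2, 'a'], [1, 'b'], ...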
def pk(ref, hyp, k=None, boundary='1'):
    """
    Compute the Pk metric for a pair of segmentations.  A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size, if None, set to half of the average reference segment length
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    """
    if k is None:
        k = int(round(len(ref) / (ref.count(boundary) * 2.)))

    err = 0
    for i in xrange(len(ref) - k + 1):
        r = ref[i:i + k].count(boundary) > 0
        h = hyp[i:i + k].count(boundary) > 0
        if r != h:
            err += 1
    return err / (len(ref) - k + 1.)
def score(self, text):
    # Sum the stored value for every length-L substring of the text;
    # substrings missing from self.ngrams contribute self.floor instead.
    score = 0
    ngrams = self.ngrams.__getitem__
    for i in xrange(len(text) - self.L + 1):
        if text[i:i + self.L] in self.ngrams:
            score += ngrams(text[i:i + self.L])
        else:
            score += self.floor
    return score
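# A minimal sketch of a class the score method above could belong to; the class
# name, the log10 weighting and the example counts are assumptions made for
# illustration, not taken from the original source.
import math

class NgramScorer(object):
    def __init__(self, counts, L):
        self.L = L
        total = float(sum(counts.values()))
        # store log-probabilities so that score() sums to a log-likelihood
        self.ngrams = {g: math.log10(c / total) for g, c in counts.items()}
        # fallback value for n-grams never seen in the counts
        self.floor = math.log10(0.01 / total)

NgramScorer.score = score  # attach the method defined above

scorer = NgramScorer({'TH': 27, 'HE': 23, 'IN': 20}, L=2)
print(scorer.score('THE'))   # scores the bigrams 'TH' and 'HE'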
import numpy as np
from nltk.compat import xrange

def create_col_array(matrix, matrix_terms_len):
    # Split the first matrix_terms_len columns of a (sparse) matrix into a
    # list of dense vectors, one per column.
    array = []
    for i in xrange(matrix_terms_len):
        col = np.array(matrix[:, i].T.toarray())
        array.append(col)
    return array
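# A small usage sketch for create_col_array; a scipy.sparse matrix is assumed
# here because the function calls .toarray() on column slices, but any matrix
# type supporting that interface would do.
from scipy.sparse import csr_matrix

m = csr_matrix([[1, 0, 2],
                [0, 3, 0]])
cols = create_col_array(m, m.shape[1])
print(len(cols))   # 3 column vectors
print(cols[0])     # first column as a dense row vector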
def pk(ref, hyp, k=None, boundary="1"):
    """
    Compute the Pk metric for a pair of segmentations.  A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> s1 = "00000010000000001000000"
    >>> s2 = "00000001000000010000000"
    >>> s3 = "00010000000000000001000"
    >>> pk(s1, s1, 3)
    0.0
    >>> pk(s1, s2, 3)
    0.095238...
    >>> pk(s2, s3, 3)
    0.190476...

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size, if None, set to half of the average reference segment length
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    """
    if k is None:
        k = int(round(len(ref) / (ref.count(boundary) * 2.0)))

    n_considered_seg = len(ref) - k + 1
    n_same_ref = 0.0
    n_false_alarm = 0.0
    n_miss = 0.0
    for i in xrange(n_considered_seg):
        bsame_ref_seg = False
        bsame_hyp_seg = False
        if boundary not in ref[(i + 1):(i + k)]:
            n_same_ref += 1.0
            bsame_ref_seg = True
        if boundary not in hyp[(i + 1):(i + k)]:
            bsame_hyp_seg = True
        if bsame_hyp_seg and not bsame_ref_seg:
            n_miss += 1
        if bsame_ref_seg and not bsame_hyp_seg:
            n_false_alarm += 1

    prob_same_ref = n_same_ref / n_considered_seg
    prob_diff_ref = 1 - prob_same_ref
    prob_miss = n_miss / n_considered_seg
    prob_false_alarm = n_false_alarm / n_considered_seg
    return prob_miss * prob_diff_ref + prob_false_alarm * prob_same_ref
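# A brief usage sketch for the pk variant above, reusing its own doctest
# strings; with k=None the window size defaults to half of the average
# reference segment length.
s1 = "00000010000000001000000"
s2 = "00000001000000010000000"
print(pk(s1, s2, 3))   # same call as the doctest, ~0.095
print(pk(s1, s2))      # default window derived from s1's boundary density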
import numpy as np

def createNumpyArray(sentences, windowsize, word2Idx, label2Idx):
    unknownIdx = word2Idx['UNK']
    paddingIdx = word2Idx['MASK']

    xMatrix = []
    yVector = []

    wordCount = 0
    unknownWordCount = 0

    for sentence in sentences:
        for targetWordIdx in xrange(len(sentence)):
            # Get the context of the target word and map these words to the
            # index in the embeddings matrix
            wordIndices = []
            for wordPosition in xrange(targetWordIdx - windowsize, targetWordIdx + windowsize + 1):
                if wordPosition < 0 or wordPosition >= len(sentence):
                    wordIndices.append(paddingIdx)
                    continue

                # Assumption: each sentence item is a (word, label) pair (the
                # label lookup below uses sentence[targetWordIdx][1]), so look
                # the word up in word2Idx and fall back to the 'UNK' index.
                word = sentence[wordPosition][0]
                wordCount += 1
                if word in word2Idx:
                    wordIndices.append(word2Idx[word])
                else:
                    wordIndices.append(unknownIdx)
                    unknownWordCount += 1

            # Get the label and map to int
            labelIdx = label2Idx[sentence[targetWordIdx][1]]

            xMatrix.append(wordIndices)
            yVector.append(labelIdx)

    print("Unknowns: %.2f%%" % (unknownWordCount / float(wordCount) * 100))
    return (np.asarray(xMatrix, dtype='int32'), np.asarray(yVector, dtype='int32'))
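# A toy usage sketch for createNumpyArray; the vocabulary, labels and the
# (word, label) sentence format are assumptions made for illustration.
word2Idx = {'MASK': 0, 'UNK': 1, 'the': 2, 'dog': 3, 'barks': 4}
label2Idx = {'O': 0, 'NOUN': 1, 'VERB': 2}
sentences = [[('the', 'O'), ('dog', 'NOUN'), ('barks', 'VERB')]]

X, y = createNumpyArray(sentences, windowsize=1, word2Idx=word2Idx, label2Idx=label2Idx)
print(X.shape, y.shape)   # one window row per token: (3, 3) and (3,)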
def test_spinner():
    rand_review = random.choice(positive_reviews)
    s = rand_review.text.lower()
    print("Original:", s)
    word_tokens = nltk.tokenize.word_tokenize(s)
    for index in xrange(len(word_tokens) - 2):
        if random.random() < 0.2:  # 20% chance of replacement
            k = (word_tokens[index], word_tokens[index + 2])
            if k in trigram_probabilities:
                w = random_sample(trigram_probabilities[k])
                word_tokens[index + 1] = w
    print(
        "Spun:",
        " ".join(word_tokens)
        .replace(" .", ".")
        .replace(" '", "'")
        .replace(" ,", ",")
        .replace("$ ", "$")
        .replace(" !", "!"),
    )
def parse(self, tokens):
    """
    Parses the input tokens with respect to the parser's grammar.  Parsing
    is accomplished by representing the search-space of possible parses as
    a fully-connected directed graph.  Arcs that would lead to ungrammatical
    parses are removed and a lattice is constructed of length n, where n is
    the number of input tokens, to represent all possible grammatical
    traversals.  All possible paths through the lattice are then enumerated
    to produce the set of non-projective parses.

    :param tokens: A list of tokens to parse.
    :type tokens: list(str)
    :return: An iterator of non-projective parses.
    :rtype: iter(DependencyGraph)
    """
    # Create graph representation of tokens
    self._graph = DependencyGraph()

    for index, token in enumerate(tokens):
        self._graph.nodes[index] = {
            'word': token,
            'deps': [],
            'rel': 'NTOP',
            'address': index,
        }

    for head_node in self._graph.nodes.values():
        deps = []
        for dep_node in self._graph.nodes.values():
            if (
                self._grammar.contains(head_node['word'], dep_node['word'])
                and head_node['word'] != dep_node['word']
            ):
                deps.append(dep_node['address'])
        head_node['deps'] = deps

    # Create lattice of possible heads
    roots = []
    possible_heads = []
    for i, word in enumerate(tokens):
        heads = []
        for j, head in enumerate(tokens):
            if (i != j) and self._grammar.contains(head, word):
                heads.append(j)
        if len(heads) == 0:
            roots.append(i)
        possible_heads.append(heads)

    # Set roots to attempt
    if len(roots) < 2:
        if len(roots) == 0:
            for i in range(len(tokens)):
                roots.append(i)

    # Traverse lattice
    analyses = []
    for root in roots:
        stack = []
        analysis = [[] for i in range(len(possible_heads))]
        i = 0
        forward = True
        while i >= 0:
            if forward:
                if len(possible_heads[i]) == 1:
                    analysis[i] = possible_heads[i][0]
                elif len(possible_heads[i]) == 0:
                    analysis[i] = -1
                else:
                    head = possible_heads[i].pop()
                    analysis[i] = head
                    stack.append([i, head])
            if not forward:
                index_on_stack = False
                for stack_item in stack:
                    if stack_item[0] == i:
                        index_on_stack = True
                orig_length = len(possible_heads[i])

                if index_on_stack and orig_length == 0:
                    for j in xrange(len(stack) - 1, -1, -1):
                        stack_item = stack[j]
                        if stack_item[0] == i:
                            possible_heads[i].append(stack.pop(j)[1])

                elif index_on_stack and orig_length > 0:
                    head = possible_heads[i].pop()
                    analysis[i] = head
                    stack.append([i, head])
                    forward = True

            if i + 1 == len(possible_heads):
                analyses.append(analysis[:])
                forward = False
            if forward:
                i += 1
            else:
                i -= 1

    # Filter parses
    # ensure 1 root, every thing has 1 head
    for analysis in analyses:
        if analysis.count(-1) > 1:
            # there are several root elements!
            continue

        graph = DependencyGraph()
        graph.root = graph.nodes[analysis.index(-1) + 1]

        for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1):
            head_address = head_index + 1

            node = graph.nodes[address]
            node.update({
                'word': token,
                'address': address,
            })

            if head_address == 0:
                rel = 'ROOT'
            else:
                rel = ''
            graph.nodes[head_index + 1]['deps'][rel].append(address)

        # TODO: check for cycles
        yield graph
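# A usage sketch for the parse method above, assuming it belongs to NLTK's
# NonprojectiveDependencyParser; the toy grammar mirrors the style of the
# examples shipped with nltk.parse and is only illustrative.
from nltk.grammar import DependencyGrammar
from nltk.parse.nonprojectivedependencyparser import NonprojectiveDependencyParser

grammar = DependencyGrammar.fromstring("""
'taught' -> 'play' | 'man'
'man' -> 'the'
'play' -> 'golf' | 'dog' | 'to'
'dog' -> 'his'
""")
parser = NonprojectiveDependencyParser(grammar)
for g in parser.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']):
    print(g.root['word'])   # 'taught' is the only word with no possible head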
def _positions(self):
    return xrange(self.num_leaves() + 1)
import nltk
from bs4 import BeautifulSoup
from nltk.compat import xrange

positive_reviews = BeautifulSoup(open("positive.review").read())
positive_reviews = positive_reviews.findAll('review_text')

# extract the trigrams
# Key -> first and last word
# value -> possible middle words
trigrams = {}
for review in positive_reviews:
    s = review.text.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    for i in xrange(len(tokens) - 2):
        k = (tokens[i], tokens[i + 2])
        if k not in trigrams:
            trigrams[k] = []
        trigrams[k].append(tokens[i + 1])

# turn each array of middle words into a probability vector
trigram_probabilities = {}
for k, words in trigrams.items():
    # create a dictionary of words -> count
    if len(set(words)) > 1:
        # only do this when there are different possibilities for a middle word
        d = {}
        n = 0
        for w in words:
            if w not in d:
                d[w] = 0
            d[w] += 1
            n += 1
        # assumed completion of the truncated snippet: normalise the counts
        # into probabilities for this (first, last) word pair
        for w, c in d.items():
            d[w] = float(c) / n
        trigram_probabilities[k] = d
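# test_spinner above calls random_sample() on one of these probability dicts;
# that helper is not shown in this collection, so here is a minimal sketch of
# what such a sampler could look like (the implementation is an assumption).
import random

def random_sample(prob_dict):
    # draw a key with probability proportional to its value
    r = random.random()
    cumulative = 0.0
    for word, p in prob_dict.items():
        cumulative += p
        if r < cumulative:
            return word
    return word  # fall back to the last key if rounding left r uncovered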
def approxrand(a, b, **kwargs):
    """
    Returns an approximate significance level between two lists of
    independently generated test values.

    Approximate randomization calculates significance by randomly drawing
    from a sample of the possible permutations. At the limit of the number
    of possible permutations, the significance level is exact. The
    approximate significance level is the sample mean number of times the
    statistic of the permutated lists varies from the actual statistic of
    the unpermuted argument lists.

    :return: a tuple containing an approximate significance level, the count
             of the number of times the pseudo-statistic varied from the
             actual statistic, and the number of shuffles
    :rtype: tuple
    :param a: a list of test values
    :type a: list
    :param b: another list of independently generated test values
    :type b: list
    """
    shuffles = kwargs.get('shuffles', 999)
    # there's no point in trying to shuffle beyond all possible permutations
    shuffles = \
        min(shuffles, reduce(operator.mul, xrange(1, len(a) + len(b) + 1)))
    stat = kwargs.get('statistic', lambda lst: sum(lst) / len(lst))
    verbose = kwargs.get('verbose', False)

    if verbose:
        print('shuffles: %d' % shuffles)

    actual_stat = fabs(stat(a) - stat(b))

    if verbose:
        print('actual statistic: %f' % actual_stat)
        print('-' * 60)

    c = 1e-100
    lst = LazyConcatenation([a, b])
    indices = list(range(len(a) + len(b)))

    for i in xrange(shuffles):
        if verbose and i % 10 == 0:
            print('shuffle: %d' % i)

        shuffle(indices)

        pseudo_stat_a = stat(LazyMap(lambda i: lst[i], indices[:len(a)]))
        pseudo_stat_b = stat(LazyMap(lambda i: lst[i], indices[len(a):]))
        pseudo_stat = fabs(pseudo_stat_a - pseudo_stat_b)

        if pseudo_stat >= actual_stat:
            c += 1

        if verbose and i % 10 == 0:
            print('pseudo-statistic: %f' % pseudo_stat)
            print('significance: %f' % ((c + 1) / (i + 1)))
            print('-' * 60)

    significance = (c + 1) / (shuffles + 1)

    if verbose:
        print('significance: %f' % significance)
        if betai:
            for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]:
                print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi)))

    return (significance, c, shuffles)
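# A usage sketch for approxrand, assuming the helpers it relies on (reduce,
# operator, fabs, shuffle, LazyConcatenation, LazyMap, xrange and optionally
# betai) are imported as in nltk.metrics.scores; the sample scores are made up.
scores_a = [0.82, 0.75, 0.91, 0.68, 0.77]
scores_b = [0.64, 0.71, 0.59, 0.73, 0.66]
significance, count, n_shuffles = approxrand(scores_a, scores_b, shuffles=200)
print(significance, n_shuffles)   # a small significance level suggests a real difference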
def apply(self, chart, grammar):
    for prod in grammar.productions(empty=True):
        for index in xrange(chart.num_leaves() + 1):
            new_edge = FeatureTreeEdge.from_production(prod, index)
            if chart.insert(new_edge, ()):
                yield new_edge
def parse(self, tokens):
    """
    Parses the input tokens with respect to the parser's grammar.  Parsing
    is accomplished by representing the search-space of possible parses as
    a fully-connected directed graph.  Arcs that would lead to ungrammatical
    parses are removed and a lattice is constructed of length n, where n is
    the number of input tokens, to represent all possible grammatical
    traversals.  All possible paths through the lattice are then enumerated
    to produce the set of non-projective parses.

    :param tokens: A list of tokens to parse.
    :type tokens: list(str)
    :return: A set of non-projective parses.
    :rtype: list(DependencyGraph)
    """
    # Create graph representation of tokens
    self._graph = DependencyGraph()
    self._graph.nodelist = []  # Remove the default root
    for index, token in enumerate(tokens):
        self._graph.nodelist.append({'word': token, 'deps': [], 'rel': 'NTOP', 'address': index})
    for head_node in self._graph.nodelist:
        deps = []
        for dep_node in self._graph.nodelist:
            if self._grammar.contains(head_node['word'], dep_node['word']) and not head_node['word'] == dep_node['word']:
                deps.append(dep_node['address'])
        head_node['deps'] = deps
    # Create lattice of possible heads
    roots = []
    possible_heads = []
    for i, word in enumerate(tokens):
        heads = []
        for j, head in enumerate(tokens):
            if (i != j) and self._grammar.contains(head, word):
                heads.append(j)
        if len(heads) == 0:
            roots.append(i)
        possible_heads.append(heads)
    # Set roots to attempt
    if len(roots) > 1:
        print("No parses found.")
        return False
    elif len(roots) == 0:
        for i in range(len(tokens)):
            roots.append(i)
    # Traverse lattice
    analyses = []
    for root in roots:
        stack = []
        analysis = [[] for i in range(len(possible_heads))]
        i = 0
        forward = True
        while i >= 0:
            if forward:
                if len(possible_heads[i]) == 1:
                    analysis[i] = possible_heads[i][0]
                elif len(possible_heads[i]) == 0:
                    analysis[i] = -1
                else:
                    head = possible_heads[i].pop()
                    analysis[i] = head
                    stack.append([i, head])
            if not forward:
                index_on_stack = False
                for stack_item in stack:
                    # print stack_item
                    if stack_item[0] == i:
                        index_on_stack = True
                orig_length = len(possible_heads[i])
                # print len(possible_heads[i])
                if index_on_stack and orig_length == 0:
                    for j in xrange(len(stack) - 1, -1, -1):
                        stack_item = stack[j]
                        if stack_item[0] == i:
                            possible_heads[i].append(stack.pop(j)[1])
                    # print stack
                elif index_on_stack and orig_length > 0:
                    head = possible_heads[i].pop()
                    analysis[i] = head
                    stack.append([i, head])
                    forward = True
            # print 'Index on stack:', i, index_on_stack
            if i + 1 == len(possible_heads):
                analyses.append(analysis[:])
                forward = False
            if forward:
                i += 1
            else:
                i -= 1
    # Filter parses
    graphs = []
    # ensure 1 root, every thing has 1 head
    for analysis in analyses:
        root_count = 0
        root = []
        for i, cell in enumerate(analysis):
            if cell == -1:
                root_count += 1
                root = i
        if root_count == 1:
            graph = DependencyGraph()
            graph.nodelist[0]['deps'] = root + 1
            for i in range(len(tokens)):
                node = {'word': tokens[i], 'address': i + 1}
                node['deps'] = [j + 1 for j in range(len(tokens)) if analysis[j] == i]
                graph.nodelist.append(node)
            # cycle = graph.contains_cycle()
            # if not cycle:
            graphs.append(graph)
    return graphs