def pcfg_generate(grammar):
    """Sample one random tree from *grammar* (a PCFG), recording every
    production used along the way.

    Expansion is rooted at the symbol 'S'; every nonterminal is expanded by
    drawing one right-hand side from a ``DictionaryProbDist`` built over that
    nonterminal's productions.  Each production used is appended to the
    module-level ``productions_corpus`` list with a placeholder probability
    of 0.

    NOTE(review): assumes a mutable ``productions_corpus`` exists at module
    level — confirm before reuse.  Also note the top-level draw samples from
    ALL of the grammar's productions (not only those with LHS 'S') and the
    result is assumed to be binary; both quirks are preserved from the
    original code.
    """

    def expand_nonterminal(non_terminal):
        """Draw one RHS expansion for *non_terminal*; return it as a list."""
        nt_productions = grammar.productions(Nonterminal(str(non_terminal)))
        rhs_probs = {pr.rhs(): pr.prob() for pr in nt_productions}
        # generate() samples one key proportionally to its weight.
        sampled = DictionaryProbDist(rhs_probs).generate()
        return list(sampled)

    def nts_into_ts(generated_nts):
        """Recursively replace each expandable child with a sampled subtree."""
        for index in range(len(generated_nts)):
            old_nt = generated_nts[index]
            try:
                expansion = expand_nonterminal(old_nt)
            except Exception:
                # Terminals (symbols with no productions) cannot be expanded;
                # leave them in place.
                continue
            productions_corpus.append(
                ProbabilisticProduction(Nonterminal(old_nt), tuple(expansion), prob=0)
            )
            generated_nts[index] = nts_into_ts(Tree(old_nt, expansion))
        return generated_nts

    # Top-level draw over the whole grammar, recorded under the 'S' symbol.
    rhs_probs = {pr.rhs(): pr.prob() for pr in grammar.productions()}
    generated = DictionaryProbDist(rhs_probs).generate()
    productions_corpus.append(
        ProbabilisticProduction(Nonterminal('S'), generated, prob=0)
    )
    # NOTE(review): assumes the sampled RHS has exactly two symbols — confirm.
    generated = Tree('S', [generated[0], generated[1]])
    return nts_into_ts(generated)
def non_terminal_into_terminal(non_terminal):
    """Sample one right-hand-side expansion of *non_terminal* and return it
    as a list of symbols.

    Weights are taken from the productions whose LHS matches
    *non_terminal*; the draw is made via ``DictionaryProbDist.generate()``.

    NOTE(review): this reads a free variable ``grammar`` that is not a
    parameter — it must be in scope (module level or enclosing scope) at
    call time; confirm against the callers.
    """
    nt_productions = grammar.productions(Nonterminal(str(non_terminal)))
    rhs_probs = {pr.rhs(): pr.prob() for pr in nt_productions}
    sampled = DictionaryProbDist(rhs_probs).generate()
    return list(sampled)
def generate_one_sample(grammar, item, depth):
    """Lazily yield the fragments of ONE random expansion of *item*.

    If *item* is a ``Nonterminal`` and *depth* > 0, a single production is
    sampled (weighted by probability) from those whose LHS is *item*, and
    every symbol of the chosen RHS is expanded recursively through
    ``_generate_all`` with ``depth - 1``.  A terminal *item* is yielded as a
    single-element list.  Nothing is yielded once the depth budget runs out.
    """
    if depth > 0:
        if isinstance(item, Nonterminal):
            # All productions whose LHS is `item`.
            np_productions = grammar.productions(item)
            # RHS tuple -> probability (renamed: the original shadowed the
            # builtin `dict`).
            rhs_probs = {}
            for pr in np_productions:
                rhs_probs[pr.rhs()] = pr.prob()
            np_probDist = DictionaryProbDist(rhs_probs)
            # generate() samples ONE probable expansion — in contrast to
            # iterating over every production of `item`.
            for frag in _generate_all(grammar, np_probDist.generate(), depth - 1):
                yield frag
        else:
            yield [item]
class NovelParagraph:
    """Generates a paragraph of sentences from stored NGram trigrams.

    Constructed from ``(source, probability)`` pairs; each new token is drawn
    from a source sampled according to those weights, with the remaining
    sources as fallbacks.  The optional ``strategy`` keyword selects the
    matching rule: 'best' (default — token+tag match, then token-only) or
    'grammar_only' (tag match only).

    Raises:
        InvalidSourceException: if any source has no NGrams at all.
    """

    def __init__(self, *args, **kwargs):
        # Matching strategy for picking the next word; see class docstring.
        self.strategy = kwargs.get('strategy', 'best')
        self.events = []
        self.sentences = []
        self.source_probability = {}
        self.querysets = {}
        self.sources = []
        # Pending closing tokens for paired constructs (quotes, brackets, …).
        self.symmetrical_tokens = []
        for source, probability in args:
            self.source_probability[source] = probability
            self.querysets[source] = NGram.objects.filter(
                **reconcile_old_style_source(source)
            )
            self.sources.append(source)
            if self.querysets[source].count() == 0:
                raise InvalidSourceException("No NGrams with this source")
        # Replace the raw weight mapping with a sampleable distribution.
        self.source_probability = DictionaryProbDist(self.source_probability)

    def pick_queryset(self):
        """Sample a source by weight and return its NGram queryset."""
        return self.querysets[self.source_probability.generate()]

    def append_sentence(self):
        """Generate one complete sentence and append it to ``self.sentences``.

        Seeds the sentence with a random sentence-starting trigram, then
        extends it word by word until terminal punctuation appears.
        """
        self.current_sentence = []
        starter = self.pick_queryset().filter(
            sentence_starter=True
        ).order_by('?').first()
        self.current_sentence.append((starter.token_one, starter.tag_one))
        self.current_sentence.append((starter.token_two, starter.tag_two))
        self.current_sentence.append((starter.token_three, starter.tag_three))
        while self.current_sentence[-1][0] not in TERMINAL_PUNCTUATION:
            new_word = self.new_word()
            self.current_sentence.append(new_word)
        self.sentences.append(self.current_sentence)

    def _get_others(self, original):
        """Return querysets for every source except *original*, in order."""
        sources = self.sources.copy()
        sources.remove(original)
        return [
            NGram.objects.filter(**reconcile_old_style_source(source))
            for source in sources
        ]

    def _account_for_symmetrical_tokens(self, token):
        """If *token* opens a paired construct, queue its closing token."""
        if token in SYMMETRICAL_TOKENS:
            self.symmetrical_tokens.append(
                (SYMMETRICAL_TOKENS[token], SYMMETRICAL_TOKENS[token])
            )

    def new_word(self):
        """Return the next ``(token, tag)`` pair for the current sentence.

        Tries the sampled source first, then the remaining sources; if a
        terminal token is found while paired tokens are still open, the
        pending closer is emitted instead.  Falls back to any queued closer,
        and finally to a period to force sentence termination.
        """
        queryset = self.pick_queryset()
        ordered_querysets = [queryset]
        if len(self.sources) > 1:
            # Reconstruct the source key of the sampled queryset so the
            # fallback list excludes it.
            if queryset.first().twitter_user:
                source = queryset.first().twitter_user.twitter_id + '@twitter'
            else:
                source = 'document:' + queryset.first().document.name
            ordered_querysets = ordered_querysets + self._get_others(source)
        for qs in ordered_querysets:
            new_word = self.new_word_from_queryset(qs)
            if new_word:
                self._account_for_symmetrical_tokens(new_word[0])
                if new_word[0] in TERMINAL_PUNCTUATION:
                    # Close an open paired token before ending the sentence.
                    if len(self.symmetrical_tokens) > 0:
                        return self.symmetrical_tokens.pop()
                return new_word
        if len(self.symmetrical_tokens) > 0:
            return self.symmetrical_tokens.pop()
        # No continuation found anywhere: force the sentence to end.
        return ('.', '.')

    def _best_matching_word(self, queryset):
        """Find the trigram that best continues the last two words.

        'grammar_only' matches on tags alone; otherwise match on both tokens
        and tags, falling back to a token-only match.  Returns an NGram row
        or None.
        """
        if self.strategy == 'grammar_only':
            return queryset.filter(
                tag_one=self.current_sentence[-2][1],
                tag_two=self.current_sentence[-1][1],
            ).order_by('?').first()
        else:
            nxt = queryset.filter(
                token_one__iexact=self.current_sentence[-2][0],
                token_two__iexact=self.current_sentence[-1][0],
                tag_one=self.current_sentence[-2][1],
                tag_two=self.current_sentence[-1][1],
            ).order_by('?').first()
            if not nxt:
                nxt = queryset.filter(
                    token_one__iexact=self.current_sentence[-2][0],
                    token_two__iexact=self.current_sentence[-1][0],
                ).order_by('?').first()
            return nxt

    def new_word_from_queryset(self, queryset):
        """Return the continuation ``(token, tag)`` from *queryset*, or None."""
        nxt = self._best_matching_word(queryset)
        if nxt:
            return (nxt.token_three, nxt.tag_three)
        else:
            return None

    @classmethod
    def _needs_space(cls, token, previous_token, index):
        """True when a space should be inserted before *token*."""
        if index == 0:
            return False
        if previous_token in NO_TRAILING_SPACE_TOKENS:
            return False
        if token in NO_LEADING_SPACE_TOKENS:
            return False
        return True

    @classmethod
    def _join_and_postprocess_sentences(cls, sentences):
        """Join token fragments into text and apply the regex clean-ups."""
        sentences = [''.join(sentence) for sentence in sentences]
        text = ' '.join(sentences)
        for pattern, replacement in REGEX_REPLACEMENTS:
            text = re.sub(pattern, replacement, text)
        return text

    def human_readable_sentences(self):
        """Render ``self.sentences`` as one post-processed paragraph string."""
        final_output = []
        for sent in self.sentences:
            output = []
            for i, token in enumerate(sent):
                if NovelParagraph._needs_space(token[0], sent[i - 1][0], i):
                    output.append(' ')
                output.append(token[0])
            final_output.append(output)
        return NovelParagraph._join_and_postprocess_sentences(final_output)