Example #1
# csv must be imported at module level; StanfordTagger, StanfordDetokenizer
# and LVGNgramGenerator are helpers defined elsewhere in this project.
def generate(self):
    """
    Interactively generates text from the raw source text in self.data,
    or from previously tagged data in self.tagged_data.
    """
    if self.data is not None:
        with open(self.data, 'r') as data_file:
            text = data_file.read()
            tagged = StanfordTagger(verbose=True).tag(text)
        if self.save_tagged_data:
            save_path = self.data + ".tags"
            with open(save_path, 'w', newline='') as save_file:
                save = csv.writer(save_file)
                save.writerows(tagged)
    elif self.tagged_data is not None:
        with open(self.tagged_data, 'r') as data_file:
            tagged = [tuple(row) for row in csv.reader(data_file)]
    else:
        raise ValueError("either data or tagged_data must be provided")
    detok = StanfordDetokenizer()
    model = LVGNgramGenerator(tagged, self.n_gram)
    methods = {'b': model.generate_without_pos,
               'n': model.generate,
               't': model.generate_alternative}
    while True:
        num_words = input('\nEnter the length in words to generate (or "q" to exit): ')
        if num_words.isdigit():
            method = input('Enter a generation method {b: baseline, n: normal, t: tuned}: ')
            if method in methods:
                print("\n\t" + detok.detokenize(' '.join(methods[method](int(num_words)))))
        elif num_words == 'q':
            break
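
A minimal harness for driving this method, assuming a hypothetical owner class that carries the attributes the method reads (data, tagged_data, save_tagged_data, n_gram); everything here other than the method itself is illustrative:

# Hypothetical harness: the original listing does not show the enclosing class.
class TextGenerator:
    data = 'corpus.txt'        # path to raw source text (placeholder name)
    tagged_data = None         # or: path to a previously saved .tags file
    save_tagged_data = True    # also write the tagged corpus as CSV
    n_gram = 3                 # n-gram order for LVGNgramGenerator
    generate = generate        # attach the method shown above

TextGenerator().generate()
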
Example #2
import random

# StanfordTagger, StanfordDetokenizer and LVGNgramGenerator are project-local
# helpers; their imports are omitted in the original listing.
def twitter(file: str, n: int, words: int, method: str):
    '''Returns the list of generated tweets.
       file   -> name of the source text file
       n      -> order of the n-gram model
       words  -> how many tweets to generate
       method -> generation method ('b', 'n' or 't')
    '''
    with open(file, 'r') as source:
        text = source.read()
    tagged = StanfordTagger(verbose=True).tag(text)

    detok = StanfordDetokenizer()
    model = LVGNgramGenerator(tagged, n)
    methods = {'b': model.generate_without_pos,
               'n': model.generate,
               't': model.generate_alternative}
    if method not in methods:
        return []

    result = []
    for _ in range(words):
        # Each tweet gets a random length between 20 and 65 words.
        num_words = random.randint(20, 65)
        result.append(detok.detokenize(' '.join(methods[method](num_words))))

    # Keep the original filtering: discard outputs that start with "forced".
    return [tweet for tweet in result if not tweet.startswith("forced")]
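
A quick usage sketch; the corpus file name is a placeholder:

tweets = twitter('corpus.txt', n=3, words=10, method='n')
for tweet in tweets:
    print(tweet)
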
Example #3
import codecs
import re
from collections import namedtuple

# StanfordTagger is NLTK's wrapper around the Stanford POS tagger; newer NLTK
# releases expose it as StanfordPOSTagger.
from nltk.tag.stanford import StanfordTagger


class Translator:

    def __init__(self):
        self.spanishEnglish = {}
        self.englishSpanish = {}
        self.text = []
        self.WordTag = namedtuple('WordTag', 'word tag')
        self.tagger = StanfordTagger('./postag/models/english-bidirectional-distsim.tagger',
                                     './postag/stanford-postagger-3.1.4.jar')
        self.patterns = [(["NN", "RB", "VBD"], ["NN", "VBD", "RB"]),    # Adverbs at end
                         (["VBD", "DT", "NNS"], ["DT", "NNS", "VBD"]),  # Subject before verb
                         (["VBN", "DT", "NN"], ["DT", "NN", "VBN"]),    # Subject before verb
                         # Preposition after possessive
                         # of close his beauty -> his beauty of close
                         ([r"IN", r"JJ", r"PRP\$", r"NN"], [r"PRP\$", r"NN", r"IN", r"JJ"]),
                         # Adjective before noun
                         # water sweet -> sweet water
                         (["NN", "JJ"], ["JJ", "NN"]),
                         (["VBN", "PRP"], ["PRP", "VBN"]),
                         (["DT", "PRP", "VB*"], ["PRP", "VB*", "DT"])]
        self.replacements = [("all the days", "always"),
                             ("him same", "himself"),
                             ("by what", "why"),
                             ("young", "youth"),
                             ("no is", "is not")]

    def read_data(self, fileName):
        """Load a dictionary file of 'english: spanish' lines into self.spanishEnglish."""
        f = codecs.open(fileName, encoding='utf-8')
        for line in f:
            m = re.match('(?P<englishWord>[^:]+): (?P<spanishWord>.+)$', line)
            if m:  # skip lines that do not match the expected format
                self.spanishEnglish[m.group('spanishWord').encode('utf-8')] = m.group('englishWord')

    def tokenTranslate(self, fileName):
        """Yield a word-for-word English translation of the Spanish input file."""
        f = codecs.open(fileName, encoding='utf-8')
        for line in f:
            for token in line.split():
                match = re.match(r'\W*(?P<word>\w+)(?P<punctuation>\W*)', token, re.UNICODE)
                if match:
                    strippedToken = match.group('word').lower()
                    # Words missing from the dictionary are dropped silently.
                    if strippedToken.encode('utf-8') in self.spanishEnglish:
                        yield self.spanishEnglish[strippedToken.encode('utf-8')]
                    if match.group('punctuation'):
                        yield match.group('punctuation')
                else:
                    yield token

    def patternsMatch(self, text, pattern):
        """
        Check if the tags in text match the tags in pattern.
        text is a list of WordTag tuples (word, tag).
        pattern is a list of tag regexes, compared with re.match.
        patternsMatch([WordTag("a", "NN"), WordTag("b", "VB")], ["NN", "VB"]) -> True
        patternsMatch([WordTag("a", "NN"), WordTag("b", "VB")], ["NN", "V*"]) -> True
        patternsMatch([WordTag("a", "NN"), WordTag("b", "VB")], ["NN", "PP"]) -> False
        """
        # re.match only anchors at the start, so "NN" also matches "NNS",
        # which is what makes patterns like "V*" act as wildcards.
        return all(re.match(tagPattern, wordTag.tag)
                   for wordTag, tagPattern in zip(text, pattern))

    def reorderPatterns(self, pattern, sub, text):
        """
        Look for pattern in text and reorder according to the rules in sub.
        pattern is a list of tag strings (e.g. ["VB", "NN", "IT"])
        sub is a permutation of pattern (e.g. ["IT", "NN", "VB"])
        text is a list of WordTag tuples that is reordered in place.
        For instance, reorderPatterns(["VB", "NN"], ["NN", "VB"], text) turns
        a text tagged ["IT", "VB", "NN"] into one tagged ["IT", "NN", "VB"].
        """
        patternLen = len(pattern)
        # Iterate by index so the final window is checked too
        for i in range(len(text) - patternLen + 1):
            if text[i].tag == pattern[0] and self.patternsMatch(text[i:i + patternLen], pattern):
                # Create a pattern -> wordTag in text mapping
                mapping = {pattern[j]: text[i + j] for j in range(patternLen)}
                # Replace the existing sub-list with the reordered list
                text[i:i + patternLen] = [mapping[p] for p in sub]

    def wordsMatch(self, englishWords, replacementWords):
        return all(english == replacement
                   for english, replacement in zip(englishWords, replacementWords))

    def makeReplacements(self, englishWords):
        """
        Apply the fixed phrase substitutions listed in self.replacements.
        englishWords is a list of English words, modified in place.
        """
        for phrase, replacementPhrase in self.replacements:
            wordsToReplace = phrase.split()
            replacementLen = len(wordsToReplace)
            # A while loop, since a replacement may change the list length
            i = 0
            while i <= len(englishWords) - replacementLen:
                # Check if we should do a replacement here
                if (englishWords[i] == wordsToReplace[0] and
                    self.wordsMatch(englishWords[i:i + replacementLen],
                                    wordsToReplace)):
                    # Splice the new words in at the matching location
                    englishWords[i:i + replacementLen] = replacementPhrase.split()
                i += 1
        return englishWords

    def marshallSentences(self, sentences):
        """
        Make stuff a little prettier. Capitalize beginnings of sentences,
        removes spaces at the end.
        """
        result = []
        for sentence in sentences:
            sentence = sentence.strip()
            if sentence:
                sentence = sentence[0].upper() + sentence[1:]
                sentence += '.'
                result.append(sentence)
        return result

    def translate(self, fileName):
        self.text = list(self.tokenTranslate(fileName))
        self.text = ' '.join(self.text).split('.')
        self.tagged = self.tagger.tag([sentence + '.' for sentence in self.text])
        # Convert to named tuples so we can access w.word, w.tag
        self.tagged = [self.WordTag(w[0], w[1]) for w in self.tagged]

        for pattern, sub in self.patterns:
            self.reorderPatterns(pattern, sub, self.tagged)

        englishWords = [wordTag.word for wordTag in self.tagged]
        englishWords = self.makeReplacements(englishWords)

        sentences = ' '.join(englishWords).split('.')
        print(' '.join(self.marshallSentences(sentences)))
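
A minimal driver for the class above; both file names are placeholders, and the dictionary file is assumed to use the 'english: spanish' line format that read_data parses:

translator = Translator()
translator.read_data('dictionary.txt')    # hypothetical path
translator.translate('spanish_text.txt')  # hypothetical path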