def __init__(self, filename):
    """Remember the input path and build the token-filtering helpers.

    Reads the module-level ``stemming`` flag: ``self.stemmer`` is only
    created when that flag is truthy.
    """
    # Allow single apostrophes but not double apostrophes.
    # Note: this doesn't allow a leading apostrophe, as in 'ere.
    word_pattern = "^[a-zA-Z]+'?[a-zA-Z]*?$"

    self.filename = filename
    self.alpha_re = re.compile(word_pattern)
    if stemming:
        self.stemmer = PorterStemmer()
    self.treebank_word_tokenizer = TreebankWordTokenizer()
class MySentences(object):
  """Iterable that yields one list of lower-cased word tokens per file line.

  Every line is split once on the first tab and only the part after it is
  used (the whole line when there is no tab). Periods are replaced by
  spaces, the rest is tokenised, and only tokens matching ``alpha_re``
  (letters with at most one apostrophe) are kept.

  The module-level ``stemming`` flag decides whether the kept tokens are
  additionally passed through ``self.stemmer`` before being yielded.
  """

  def __init__(self, filename):
    """Store *filename* and set up the regex, tokenizer and optional stemmer."""
    self.filename = filename
    # allow single apostrophes but not double apostrophes: note, this doesn't allow 'ere
    self.alpha_re = re.compile("^[a-zA-Z]+'?[a-zA-Z]*?$")
    if stemming:
      self.stemmer = PorterStemmer()
    self.treebank_word_tokenizer = TreebankWordTokenizer()
    # TODO: use http://www.nltk.org/howto/stem.html

  def __iter__(self):
    """Yield the filtered (and optionally stemmed) tokens of each line."""
    # TODO find a better way to distinguish sentence-initial caps from proper noun

    # sentences come like this:
    # 80  10:11 p.m., an unwanted person was reported on College Avenue.
    # 81  10:13 a.m., a report of shoplifting was investigated at Maine Smoke Shop on College Avenue.
    # 82  10:14: The proportion of A-levels awarded at least an A grade has fallen for the second year in a row.
    # 141529  But the debt ceiling may end up being the larger inflection point, especially as Obama staked out a hard-lined position against negotiating over that vote.

    # Read raw bytes and decode explicitly: a text-mode line has no
    # ``.decode`` on Python 3, while bytes decode the same way on 2 and 3.
    # The ``with`` block closes the handle once iteration finishes.
    with open(self.filename, "rb") as handle:
      for line in handle:
        sentence = line.decode("UTF8").split("\t", 1)[-1].replace(".", " ")
        words = [
          token.lower()
          for token in self.treebank_word_tokenizer.tokenize(sentence)
          if self.alpha_re.match(token)
        ]
        if stemming:
          yield [self.stemmer.stem(word) for word in words]
        else:
          yield words
class MySentences(object):
    """Iterable that yields one list of lower-cased word tokens per file line.

    Every line is split once on the first tab and only the part after it is
    used (the whole line when there is no tab). Periods are replaced by
    spaces, the rest is tokenised, and only tokens matching ``alpha_re``
    (letters with at most one apostrophe) are kept.

    The module-level ``stemming`` flag decides whether the kept tokens are
    additionally passed through ``self.stemmer`` before being yielded.
    """

    def __init__(self, filename):
        """Store *filename* and set up the regex, tokenizer and optional stemmer."""
        self.filename = filename
        # allow single apostrophes but not double apostrophes: note, this doesn't allow 'ere
        self.alpha_re = re.compile("^[a-zA-Z]+'?[a-zA-Z]*?$")
        if stemming:
            self.stemmer = PorterStemmer()
        self.treebank_word_tokenizer = TreebankWordTokenizer()
        # TODO: use http://www.nltk.org/howto/stem.html

    def __iter__(self):
        """Yield the filtered (and optionally stemmed) tokens of each line."""
        # TODO find a better way to distinguish sentence-initial caps from proper noun

        # sentences come like this:
        # 80  10:11 p.m., an unwanted person was reported on College Avenue.
        # 81  10:13 a.m., a report of shoplifting was investigated at Maine Smoke Shop on College Avenue.
        # 82  10:14: The proportion of A-levels awarded at least an A grade has fallen for the second year in a row.
        # 141529  But the debt ceiling may end up being the larger inflection point, especially as Obama staked out a hard-lined position against negotiating over that vote.

        # Read raw bytes and decode explicitly: a text-mode line has no
        # ``.decode`` on Python 3, while bytes decode the same way on 2 and 3.
        # The ``with`` block closes the handle once iteration finishes.
        with open(self.filename, "rb") as handle:
            for line in handle:
                sentence = (
                    line.decode("UTF8").split("\t", 1)[-1].replace(".", " ")
                )
                words = [
                    token.lower()
                    for token in self.treebank_word_tokenizer.tokenize(sentence)
                    if self.alpha_re.match(token)
                ]
                if stemming:
                    yield [self.stemmer.stem(word) for word in words]
                else:
                    yield words
Ejemplo n.º 4
0
# Optional second command-line argument; stays empty when it is not given.
prefix = sys.argv[2] if len(sys.argv) > 2 else ''

# First command-line argument; indexing fails with IndexError when absent.
in_folder = sys.argv[1]


def ensure_dir(f):
    """Create the directory part of path *f* if it is missing.

    Nothing happens when *f* has no directory component or when that
    location already exists.

    Raises:
        OSError: if the directory could not be created and is still absent
            afterwards (e.g. permission denied).
    """
    d = os.path.dirname(f)
    if not d:
        return
    try:
        os.makedirs(d)
    except OSError:
        # Attempt first, verify afterwards: checking ``exists`` before
        # creating leaves a window in which another process can create the
        # directory and make ``makedirs`` fail. A path that is present after
        # the failure is treated as success, like the pre-existing case.
        if not os.path.exists(d):
            raise

# Make sure the directory part of the prefix exists before anything is written.
if prefix:
    ensure_dir(prefix)

tokr = TreebankWordTokenizer()
word_tokenize = tokr.tokenize


# One entry per line of the "stop-en" file, with surrounding whitespace removed.
with codecs.open("stop-en", "r", encoding='utf-8') as stopfile:
    stopwordlist = set(entry.strip() for entry in stopfile)


# The tokeniser changes punctuation in unfortunate ways, so the quote tokens
# it produces are added to the stop words as well, along with "link".
stopwordlist.update((u"``", u"''", u"link"))

# Four runs of digits separated by dots, anchored at the end of the string.
ipaddress = re.compile(r"[0-9]+(\.[0-9]+){3}$")
 def __init__(self, filename):
   """Keep the source path and prepare the regex, stemmer and tokenizer.

   The stemmer attribute is only set when the module-level ``stemming``
   flag is truthy.
   """
   # Allow single apostrophes but not double apostrophes.
   # Note: this doesn't allow a leading apostrophe, as in 'ere.
   word_pattern = "^[a-zA-Z]+'?[a-zA-Z]*?$"

   self.filename = filename
   self.alpha_re = re.compile(word_pattern)
   if stemming:
     self.stemmer = PorterStemmer()
   self.treebank_word_tokenizer = TreebankWordTokenizer()
def _split_keeping_gaps(sentence, tokens):
    """Cut *sentence* into *tokens* plus whatever text sits between them.

    Tokens are located left to right; any text skipped over to reach a token
    (typically whitespace) is emitted as its own piece so that joining the
    pieces reproduces the scanned part of the sentence. Text after the last
    token is not emitted.

    Raises:
        ValueError: if a token cannot be found in the remaining text.
    """
    pieces = []
    remainder = sentence
    for token in tokens:
        surface = token
        # The Treebank tokenizer rewrites a double quote as `` or '', so
        # those tokens may not occur literally in the sentence; look for
        # the original quote character instead of failing the lookup.
        if token in ("``", "''") and token not in remainder:
            surface = '"'
        split_idx = remainder.index(surface)
        if split_idx:
            pieces.append(remainder[:split_idx])
        token_end = split_idx + len(surface)
        pieces.append(remainder[split_idx:token_end])
        remainder = remainder[token_end:]
    return pieces


def rephrase(sentence, theme=None):
    """Rewrite *sentence* by swapping its content words for synonyms.

    The sentence is tokenised, the tokens (and the gaps between them) are
    sent to the phrase service on localhost:5000, and every returned phrase
    that is not a stop word and starts with a letter is replaced through
    ``get_synonym``. When the module-level ``stemming`` flag is set, the
    synonym is looked up for the word's stem and the word's original ending
    is re-attached and spell-corrected.

    Args:
        sentence: the text to rephrase.
        theme: passed through unchanged to ``get_synonym``.

    Returns:
        The result of ``titlecase`` applied to the re-joined pieces, after
        stripping surrounding whitespace and closing up the space before 's.

    Raises:
        ValueError: if a token produced by the tokenizer cannot be located
            in the sentence.
    """
    new_version = []
    # TODO: should not split on apostrophes in "can't" or "eatin'" but should for "spiders'"
    tokens = TreebankWordTokenizer().tokenize(sentence)
    words = _split_keeping_gaps(sentence, tokens)

    # TODO: put this in the API. Phrase grouping was previously done here
    # with a bigram/trigram ``Phrases`` model loaded from disk; it is now
    # requested from the local service instead.
    # NOTE(review): the pieces are joined into the URL path unescaped, so a
    # sentence containing ',', '/', '?' or '#' may not reach the service
    # intact -- confirm what the endpoint expects before changing this.
    resp = requests.get("http://localhost:5000/phrases/" +
                        ','.join(words)).json()
    # Fall back to the plain pieces when the service groups nothing.
    phrases = resp["grouped"] or words

    for word in phrases:
        if word == '':
            continue
        if word in stopwords or not word[0].isalpha():
            new_version.append(word)
        else:
            # TODO: retain punctuation
            # Collect the leading / trailing characters matched by
            # non_alpha_chars so they can be put back around the synonym.
            prefixes = []
            suffixes = []
            for chars, fix in ((word, prefixes), (reversed(word), suffixes)):
                for char in chars:
                    if non_alpha_chars.match(char):
                        fix.append(char)
                    else:
                        break
            # Stripping non-alphabetic chars from the word itself was removed
            # on purpose so "you're" stays "you're" and not "youre".
            if stemming:
                word_stem = stemmer.stem(word)
                word_morphology = word_diff(word, word_stem)
                random_weird_changes = word_diff(word_stem, word)
                print("%s = %s + %s (- %s)" %
                      (word, word_stem, word_morphology, random_weird_changes))

                synonym_stem = get_synonym(word_stem, theme)
                if random_weird_changes:
                    print("random_weird_changes: %s" % random_weird_changes)
                    # Peel the stemmer's own alterations off the end of the
                    # synonym stem, last character first. Stop as soon as the
                    # ending no longer matches or the stem is used up, so an
                    # empty stem never gets indexed.
                    stem_chars = list(synonym_stem)
                    for subtraction in reversed(random_weird_changes):
                        if not stem_chars or stem_chars[-1] != subtraction:
                            break
                        print("removed %s" % stem_chars.pop())
                    synonym_stem = stem_chars

                misspelled_synonym = ''.join(synonym_stem) + "".join(
                    word_morphology)
                synonym = spell_check.correct(misspelled_synonym)
            else:
                synonym = get_synonym(word, theme)
            print("new version" + str(new_version))
            new_version.append(''.join(prefixes) + synonym +
                               ''.join(reversed(suffixes)))
    print("new version" + str(new_version))
    return titlecase(''.join(new_version).strip().replace(" 's", "'s"))