Example #1
from nltk.tokenize.punkt import PunktWordTokenizer

class Translator:
    """Word-by-word translator backed by one or more lookup dictionaries."""
    def __init__(self, *dictionaries):
        self.tokenizer = PunktWordTokenizer()
        self.dictionaries = dictionaries

    def translate(self, sentence):
        tokens = self.tokenizer.tokenize(sentence)

        def select_value(l):
            '''Should select the correct value'''
            #TODO: Implement this, right now has default behavior
            if isinstance(l, list):
                return l[0]
            else:
                return l

        def tr(word):
            for d in self.dictionaries:
                found = d.get(word)  # missing keys fall through to the next dictionary
                if found is not None:
                    return found
            else:
                return word

        return [select_value(tr(w)) for w in tokens]
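
A minimal usage sketch for the Translator above. The dictionary contents and the sentence are made up for illustration; the sketch assumes plain dicts mapping source words to either a single translation or a list of candidates, the two shapes select_value handles.

en_fr = {"hello": ["bonjour", "salut"], "world": "monde"}
fallback = {"friend": "ami"}

translator = Translator(en_fr, fallback)
print(translator.translate("hello world friend"))
# -> ['bonjour', 'monde', 'ami']  (unknown tokens would pass through unchanged)
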
Example #2
from nltk.tokenize.punkt import PunktWordTokenizer

class Translator:
    """Word-by-word translator backed by one or more lookup dictionaries."""
    def __init__(self, *dictionaries):
        self.tokenizer = PunktWordTokenizer()
        self.dictionaries = dictionaries

    def translate(self, sentence):
        tokens = self.tokenizer.tokenize(sentence)

        def select_value(l):
            '''Should select the correct value'''
            #TODO: Implement this, right now has default behavior
            if isinstance(l, list):
                return l[0]
            else:
                return l

        def tr(word):
            for d in self.dictionaries:
                found = d.get(word)  # missing keys fall through to the next dictionary
                if found is not None:
                    return found
            else:
                return word

        return [select_value(tr(w)) for w in tokens]
Example #3
def expandSet(kwd_set, root_elt):
    '''
    Expands a given set of keywords using the whole text and
    co-occurrence probabilities
    @param kwd_set: Set<string>. Set of mentioned kwds
    @param root_elt: etree.Element. The root element of the document
    '''
    lines = [elt.text for elt in root_elt.findall(".//line")]
    stop_words = set(stopwords.words("english"))
    tokenizer = PunktWordTokenizer()
    all_pairs = []
    for line in lines:
        for kwd in kwd_set:
            if re.match(kwd, line):
                tokens = filter(lambda x: x not in stop_words and
                                x not in string.punctuation,
                                tokenizer.tokenize(line))
                for token in tokens:
                    all_pairs.append((kwd, token))
    top_pairs = [pair for pair, freq in Counter(all_pairs).iteritems()
                 if freq >= 2]
    for pair in top_pairs:
        if KeywordExpander.verbose and pair[1] not in kwd_set:
            print "Expanding kwd with : ", pair[1]
        kwd_set.add(pair[1])

    return kwd_set
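
The excerpt above reads a KeywordExpander.verbose flag and relies on re, string, collections.Counter, the NLTK stopwords corpus and PunktWordTokenizer being available at module level. A hedged sketch of how it might be driven, with a stub for the flag and made-up data (Python 2, matching the excerpt):

from xml.etree import ElementTree as etree

class KeywordExpander(object):
    # stub for the verbose flag the excerpt reads
    verbose = True

root = etree.fromstring(
    "<doc>"
    "<line>deep learning models need data</line>"
    "<line>deep learning models need compute</line>"
    "<line>gardening is relaxing</line>"
    "</doc>")

print expandSet(set(["deep"]), root)
# tokens that co-occur with "deep" in at least two matching lines
# ("learning", "models", ...) are added to the keyword set
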
Example #4
import nltk
from nltk.tokenize.punkt import PunktWordTokenizer

def Document2Word2VecTrainingInputFormat(document):
	"""
		Given an input string of plain text sentences, first
		splits the document into individual sentences, then tokenizes
		each sentence at the word level. Returns a list of lists where
		each inner list represents a sentence in the input and the
		contents are the individual words of that sentence.
	"""
	output = list()
	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
	word_detector = PunktWordTokenizer()
	sentences = sent_detector.tokenize(document)
	for sent in sentences:
		output.append(word_detector.tokenize(sent))
	return output
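
A quick shape check for the function above; it needs nltk's punkt model (nltk.download('punkt')). The sample text is illustrative only.

doc = "Gensim expects tokenized sentences. Each sentence becomes a list of words."
for sent in Document2Word2VecTrainingInputFormat(doc):
    print(sent)
# prints two lists, one per sentence, each holding that sentence's word tokens
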
Example #5
def parse_ap(input_path, output_path):
    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from nltk.tokenize.punkt import PunktWordTokenizer
    tokenizer = PunktWordTokenizer()

    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()

    from string import ascii_lowercase

    doc_title = ""
    doc_content = []
    doc_count = 0

    input_file_stream = open(input_path, 'r')
    output_file_stream = open(output_path, 'w')
    for line in input_file_stream:
        line = line.strip().lower()

        if line == "<text>" or line == "</text>":
            continue

        if line == "<doc>":
            continue

        if line == "</doc>":
            output_file_stream.write("%s\t%s\n" %
                                     (doc_title, " ".join(doc_content)))
            doc_count += 1
            if doc_count % 1000 == 0:
                print("successfully parsed %d documents" % (doc_count))
            continue

        if line.startswith("<docno>"):
            line = line.lstrip("<docno>")
            line = line.rstrip("</docno>")
            doc_title = line.strip()
            continue

        #doc_content = [stemmer.stem(x) for x in tokenizer.tokenize(line) if (min(y in ascii_lowercase for y in x))];
        doc_tokens = [
            x for x in tokenizer.tokenize(line)
            if (min(y in ascii_lowercase for y in x))
        ]
        doc_content = [stemmer.stem(x) for x in doc_tokens if x not in stop]
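
A hedged sketch of driving parse_ap with a tiny AP/TREC-style file built from the tags the parser checks for; the file names and the document are placeholders, and the NLTK stopwords corpus must be downloaded.

sample = """<DOC>
<DOCNO> AP880212-0001 </DOCNO>
<TEXT>
Reports of snow delayed flights across the region.
</TEXT>
</DOC>
"""
with open('ap_sample.txt', 'w') as handle:
    handle.write(sample)

parse_ap('ap_sample.txt', 'ap_sample.parsed')
# ap_sample.parsed now holds one line per document:
# "<doc id>\t<stemmed, stopword-free tokens>"
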
Example #6
def parse_ap(input_path, output_path):
    from nltk.corpus import stopwords
    stop = stopwords.words('english')

    from nltk.tokenize.punkt import PunktWordTokenizer
    tokenizer = PunktWordTokenizer()

    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()

    from string import ascii_lowercase

    doc_title = ""
    doc_content = []
    doc_count = 0

    input_file_stream = open(input_path, 'r')
    output_file_stream = open(output_path, 'w')
    for line in input_file_stream:
        line = line.strip().lower()

        if line == "<text>" or line == "</text>":
            continue

        if line == "<doc>":
            continue

        if line == "</doc>":
            output_file_stream.write("%s\t%s\n" % (doc_title, " ".join(doc_content)))
            doc_count += 1
            if doc_count % 1000 == 0:
                print("successfully parsed %d documents" % (doc_count))
            continue

        if line.startswith("<docno>"):
            line = line.lstrip("<docno>")
            line = line.rstrip("</docno>")
            doc_title = line.strip()
            continue

        #doc_content = [stemmer.stem(x) for x in tokenizer.tokenize(line) if (min(y in ascii_lowercase for y in x))];
        doc_tokens = [x for x in tokenizer.tokenize(line) if (min(y in ascii_lowercase for y in x))]
        doc_content = [stemmer.stem(x) for x in doc_tokens if x not in stop]
Example #7
def neutralRemover(message):
    # Analyze the sentiment of a message by comparing its words against lists
    # of "positively" and "negatively" oriented words.
    buy, sell = 0, 0

    #Tokenize the message into individual words
    tokenizer = PunktWordTokenizer()

    #Assign a bullish or bearish sentiment to each word
    for word in tokenizer.tokenize(message):
        if word in pos:
            buy += 1
        if word in neg:
            sell += 1

    #Compare total bullish sentiment to total bearish sentiment
    if buy > sell:
        return 1

    if buy < sell:
        return -1
    return 0
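
The function reads module-level pos and neg word collections that the excerpt does not show, and it needs the PunktWordTokenizer import. A sketch with made-up word lists:

pos = set(["bullish", "rally", "gain", "beat"])
neg = set(["bearish", "miss", "drop", "loss"])

print(neutralRemover("huge rally after the earnings beat"))   # 1
print(neutralRemover("shares drop on revenue miss"))          # -1
print(neutralRemover("the company reported earnings today"))  # 0
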
Example #8
    # I assume it just toggles, no nesting
    in_open_quote = False
    in_open_anchor_tag = False
    
    previous_word_translated = False

    for line in f:
        line = line.replace('&#32;', ' ')\
                   .replace('>', '> ').replace('<', ' <')\
                   .replace(START_QUOTE, 'START_QUOTE ')\
                   .replace(END_QUOTE, ' END_QUOTE')\
                   .replace('.', ' .')

        new_words = list()

        for word in tkn.tokenize(line):
            if word.startswith('<A') or word.endswith('A>'):
                in_open_anchor_tag = not in_open_anchor_tag
            if word == 'START_QUOTE' or word == 'END_QUOTE':
                in_open_quote = not in_open_quote

            # should we be translating this word?
            to_translate = True
            if in_open_quote and not REPLACE_DIALOGUE:
                to_translate = False
            if not in_open_quote and not REPLACE_PROSE:
                to_translate = False
            if in_open_anchor_tag and IGNORE_HYPERREFS:
                to_translate = False
            if not word.isalnum():  # don't translate punctuation
                to_translate = False
Example #9
class NltkTools:
    _abbrevPattern  = re.compile(r"([\w][\w]?[.]){2,}$", re.UNICODE)
    _datePattern    = re.compile(r"(^|\s)(?:[\d]{2}){1,2}[.]$", re.UNICODE)
    _cleanerPattern = re.compile(r"(\w\w)([.?,:;!])(\w)(\w)", re.UNICODE)

    def __init__(self, tok=False, wtok=False, stok=False, pos=False, stem=False,
                 pos_model=None, abbrev_set=None):
        """@param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True
            
        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
            # TODO: handle cases like "Bragantino,2006.In"
        if stok:
            try:
                self.senTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            except LookupError:
                sys.stderr.write("WARNING: english tokenizer cannot be loaded, nltk.data does not contain in!")
                sys.stderr.write("WARNING: using an untrained sen_tokenizer")
                self.senTokenizer = PunktSentenceTokenizer()
        
        self.abbrev_set = (set(abbrev_set) if abbrev_set is not None else set())
        
        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(os.path.join(os.environ['HUNPOS'], 'english.model'), encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()

    def tokenize(self, raw):
        """Runs sentence and then word tokenization. Does some abbreviation-
        detection to fix false sentence endings."""
        sentences = self.sen_tokenize(raw)
        tokens = [self.word_tokenize(sen) for sen in sentences]
        for i in reversed(xrange(len(tokens) - 1)):
            if ( self.is_abbrev(tokens[i][-1])
                 or NltkTools._abbrevPattern.match(tokens[i][-1]) is not None
                 and not NltkTools.starts_with_upper(tokens[i + 1][0])):
                tokens[i].extend(tokens[i + 1])
                tokens.pop(i + 1)
        return tokens
        

    def sen_tokenize(self, raw):
        """Tokenizes the raw text into sentences."""
        raw = NltkTools.cleanup_puncts(raw)
        return self.senTokenizer.tokenize(raw)

    def filter_long_sentences(self, raw, length=1024):
        """Filters "sentences" (non-whitespace character sequences longer than
        length) from the text."""
        # TODO: This looks nice but it is too generous with memory use
        return ' '.join(filter(lambda x: len(x) <= length, re.split(r"\s+", raw)))
        
    def sen_abbr_tokenize(self, raw):
        """Tokenizes the raw text into sentences, and tries to handle problems
        caused by abbreviations and such."""
        sentences = self.sen_tokenize(raw)
        for i in reversed(xrange(len(sentences) - 1)):
            if (NltkTools._abbrevPattern.search(sentences[i]) is not None
                    and not NltkTools.starts_with_upper(sentences[i + 1])):
                sentences[i] = ' '.join(sentences[i:i+2])
                sentences.pop(i + 1)
        return sentences

    @staticmethod
    def starts_with_upper(text):
        """Checks if the sentence starts with an upper case letter."""
        t = text.lstrip()
        return len(t) > 0 and t[0].isupper()
    
    @staticmethod
    def cleanup_puncts(raw):
        pos = 0
        cleaner = NltkTools._cleanerPattern.search(raw[pos:])
        while cleaner:
            if cleaner.group(2) == "." and not cleaner.group(3)[0].isupper():
                pos = cleaner.end()
            elif cleaner.group(1)[-1].isdigit() and cleaner.group(3)[0].isdigit():
                pos = cleaner.end()
            else:
                changed_part_string = cleaner.expand(r"\1\2 \3\4")
                raw = raw[:cleaner.start()] + changed_part_string + raw[cleaner.end():]
                pos = cleaner.end()
            cleaner = NltkTools._cleanerPattern.search(raw, pos)
        return raw
    
    def is_abbrev(self, tok):
        return tok in self.abbrev_set

    def word_tokenize(self, sen):
        """Tokenizes the sentence to words and splits the sentence ending
        punctuation mark from the last word and adds it as the last token."""
        tokens = self.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []
        punktMatchObject = self.punktSplitter.match(tokens[-1])
        if punktMatchObject is not None and not self.is_abbrev(tokens[-1]):
            tokens = tokens[:-1] + list(punktMatchObject.groups())
        return tokens

    def pos_tag(self, sentokens):
        return self.posTagger.tag(sentokens)

    def stem(self, tokens):
        return ((tok, pos, self.stemmer.lemmatize(tok, penn_to_major_pos[pos])) for tok, pos in tokens)
        
    def tag_raw(self, raw_text):
        """Convenience method for tagging (a line of) raw text. The NltkTools
        instance must have been initialized with C{pos=True, stem=True, tok=True}.
        It is a generator: it yields the attribute array of one word at a
        time. The attributes are the word, the pos tag and the stem."""
        sens = self.tokenize(raw_text)
        pos_tagged = list(self.pos_tag(sen) for sen in sens)
        stemmed = list(self.stem(pos_tagged_sen) for pos_tagged_sen in pos_tagged)
        for sen in stemmed:
            for tok in sen:
                yield tok
            yield []
        return
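
A tokenization-only sketch of the class above (Python 2, like the excerpt). It assumes the imports the class relies on (nltk, re, sys, PunktWordTokenizer, PunktSentenceTokenizer) plus the punkt model; pos tagging and stemming would additionally need HunPos, the WordNet lemmatizer and the penn_to_major_pos mapping, so they are left switched off here.

nt = NltkTools(tok=True, abbrev_set=set(["cca.", "etc."]))
for sentence in nt.tokenize("Prices rose by cca. 10 percent. They fell back later."):
    print(sentence)
# each item is one sentence as a list of word tokens; if punkt wrongly splits
# after "cca.", the abbreviation handling merges the two pieces back together
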
Example #10
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktWordTokenizer

def get_tokens(text, remove_stopwords=True):
    tokenizer = PunktWordTokenizer()
    return [term for term in tokenizer.tokenize(text.lower()) \
        if (len(term) > 1 or term.isalpha()) and \
        (term not in stopwords.words('english') or (not remove_stopwords))]
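
Example call for get_tokens; it needs the NLTK stopwords corpus. Note that stopwords.words('english') is re-read for every token, which is convenient rather than fast.

print(get_tokens("The quick brown fox jumps over the lazy dog"))
# -> ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
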
Example #11
    # I assume it just toggles, no nesting
    in_open_quote = False
    in_open_anchor_tag = False

    previous_word_translated = False

    for line in f:
        line = line.replace('&#32;', ' ')\
                   .replace('>', '> ').replace('<', ' <')\
                   .replace(START_QUOTE, 'START_QUOTE ')\
                   .replace(END_QUOTE, ' END_QUOTE')\
                   .replace('.', ' .')

        new_words = list()

        for word in tkn.tokenize(line):
            if word.startswith('<A') or word.endswith('A>'):
                in_open_anchor_tag = not in_open_anchor_tag
            if word == 'START_QUOTE' or word == 'END_QUOTE':
                in_open_quote = not in_open_quote

            # should we be translating this word?
            to_translate = True
            if in_open_quote and not REPLACE_DIALOGUE:
                to_translate = False
            if not in_open_quote and not REPLACE_PROSE:
                to_translate = False
            if in_open_anchor_tag and IGNORE_HYPERREFS:
                to_translate = False
            if not word.isalnum():  # don't translate punctuation
                to_translate = False
Example #12
from nltk.tokenize.punkt import PunktWordTokenizer

class Word_Tokenizer():
    def __init__(self):
        self.tokenizer = PunktWordTokenizer()
    
    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)
Example #13
class NltkTools:
    _abbrevPattern = re.compile(r"([\w][\w]?[.]){2,}$", re.UNICODE)
    _datePattern = re.compile(r"(^|\s)(?:[\d]{2}){1,2}[.]$", re.UNICODE)
    _cleanerPattern = re.compile(r"(\w\w)([.?,:;!])(\w)(\w)", re.UNICODE)

    def __init__(self,
                 tok=False,
                 wtok=False,
                 stok=False,
                 pos=False,
                 stem=False,
                 pos_model=None,
                 abbrev_set=None,
                 stok_model=None):
        """@param abbrev_set: a set of frequent abbreviations."""
        if tok:
            wtok = True
            stok = True

        if wtok:
            self.wordTokenizer = PunktWordTokenizer()
            #self.punktSplitter = re.compile(r"^([\w\d]+)([.?,:;!])$", re.UNICODE)
            self.punktSplitter = re.compile(r"^(.+)([.?,:;!])$", re.UNICODE)
            # TODO: handle cases like "Bragantino,2006.In"
        if stok:
            if stok_model is not None:
                try:
                    self.senTokenizer = stok_model
                except LookupError:
                    sys.stderr.write("WARNING: tokenizer cannot be loaded")
                    sys.stderr.write(
                        "WARNING: using an untrained sen_tokenizer")
                    self.senTokenizer = PunktSentenceTokenizer()
            else:
                try:
                    self.senTokenizer = nltk.data.load(
                        'tokenizers/punkt/english.pickle')
                except LookupError:
                    sys.stderr.write(
                        "WARNING: english tokenizer cannot be loaded, nltk.data does not contain in!"
                    )
                    sys.stderr.write(
                        "WARNING: using an untrained sen_tokenizer")
                    self.senTokenizer = PunktSentenceTokenizer()

        self.abbrev_set = (set(abbrev_set)
                           if abbrev_set is not None else set())

        if pos:
            if pos_model is not None:
                self.posTagger = HunposTagger(pos_model, encoding="utf-8")
            else:
                self.posTagger = HunposTagger(os.path.join(
                    os.environ['HUNPOS'], 'english.model'),
                                              encoding="utf-8")
        if stem:
            self.stemmer = WordNetLemmatizer()

    def tokenize(self, raw):
        """Runs sentence and then word tokenization. Does some abbreviation-
        detection to fix false sentence endings."""
        sentences = self.sen_tokenize(raw)
        tokens = [self.word_tokenize(sen) for sen in sentences]
        for i in reversed(xrange(len(tokens) - 1)):
            if (self.is_abbrev(tokens[i][-1]) or
                    NltkTools._abbrevPattern.match(tokens[i][-1]) is not None
                    and not NltkTools.starts_with_upper(tokens[i + 1][0])):
                tokens[i].extend(tokens[i + 1])
                tokens.pop(i + 1)
        return tokens

    def sen_tokenize(self, raw):
        """Tokenizes the raw text into sentences."""
        raw = NltkTools.cleanup_puncts(raw)
        return self.senTokenizer.tokenize(raw)

    def filter_long_sentences(self, raw, length=1024):
        """Filters "sentences" (non-whitespace character sequences longer than
        length) from the text."""
        # TODO: This looks nice but it is too generous with memory use
        return ' '.join(
            filter(lambda x: len(x) <= length, re.split(r"\s+", raw)))

    def sen_abbr_tokenize(self, raw):
        """Tokenizes the raw text into sentences, and tries to handle problems
        caused by abbreviations and such."""
        sentences = self.sen_tokenize(raw)
        for i in reversed(xrange(len(sentences) - 1)):
            if (NltkTools._abbrevPattern.search(sentences[i]) is not None
                    and not NltkTools.starts_with_upper(sentences[i + 1])):
                sentences[i] = ' '.join(sentences[i:i + 2])
                sentences.pop(i + 1)
        return sentences

    @staticmethod
    def starts_with_upper(text):
        """Checks if the sentence starts with an upper case letter."""
        t = text.lstrip()
        return len(t) > 0 and t[0].isupper()

    @staticmethod
    def cleanup_puncts(raw):
        pos = 0
        cleaner = NltkTools._cleanerPattern.search(raw[pos:])
        while cleaner:
            if cleaner.group(2) == "." and not cleaner.group(3)[0].isupper():
                pos = cleaner.end()
            elif cleaner.group(1)[-1].isdigit() and cleaner.group(
                    3)[0].isdigit():
                pos = cleaner.end()
            else:
                changed_part_string = cleaner.expand(r"\1\2 \3\4")
                raw = raw[:cleaner.start(
                )] + changed_part_string + raw[cleaner.end():]
                pos = cleaner.end()
            cleaner = NltkTools._cleanerPattern.search(raw, pos)
        return raw

    def is_abbrev(self, tok):
        return tok in self.abbrev_set

    def word_tokenize(self, sen):
        """Tokenizes the sentence to words and splits the sentence ending
        punctuation mark from the last word and adds it as the last token."""
        tokens = self.wordTokenizer.tokenize(sen)
        if len(tokens) == 0:
            return []
        punktMatchObject = self.punktSplitter.match(tokens[-1])
        if punktMatchObject is not None and not self.is_abbrev(tokens[-1]):
            tokens = tokens[:-1] + list(punktMatchObject.groups())
        return tokens

    def pos_tag(self, sentokens):
        return self.posTagger.tag(sentokens)

    def stem(self, tokens):
        return ((tok, pos, self.stemmer.lemmatize(tok, penn_to_major_pos[pos]))
                for tok, pos in tokens)

    def tag_raw(self, raw_text):
        """Convenience method for tagging (a line of) raw text. The NltkTools
        instance must have been initialized with C{pos=True, stem=True, tok=True}.
        It is a generator: it yields the attribute array of one word at a
        time. The attributes are the word, the pos tag and the stem."""
        sens = self.tokenize(raw_text)
        pos_tagged = list(self.pos_tag(sen) for sen in sens)
        stemmed = list(
            self.stem(pos_tagged_sen) for pos_tagged_sen in pos_tagged)
        for sen in stemmed:
            for tok in sen:
                yield tok
            yield []
        return
Example #14
def _tokenize(self):
    tok = PunktWordTokenizer()
    #tok = TreebankWordTokenizer()
    split_whitespace = lambda: re.compile(r'(\s+)').split(re.sub(r"\.", " .", self.text))
    return list(chain(*[s if s.isspace() else tok.tokenize(s) for s in split_whitespace()]))
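
The method above belongs to a class that is not shown; it only needs a self.text attribute, plus re, itertools.chain and PunktWordTokenizer in its module. A small stand-in harness, purely illustrative:

class TextHolder(object):
    # minimal stand-in for the original class; only supplies `text`
    def __init__(self, text):
        self.text = text

print(_tokenize(TextHolder(u"One sentence. And another one.")))
# periods come out as separate tokens and the whitespace between words is
# kept in the token stream
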
Example #15
    args = parser.parse_args()

    logging_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=logging_level)

    if args.model:
        logging.debug('loading model...')
        hmm = load_model(args.model)

    if args.corpus:
        logging.debug('loading corpus...')
        corpus = open(args.corpus, 'rb').read()
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        word_detector = PunktWordTokenizer()
        sentences = sent_detector.tokenize(corpus.strip())
        words = [cleanup_words(word_detector.tokenize(s)) for s in sentences]

        logging.debug('training model...')
        trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=range(8), symbols=symbols(words))
        hmm = trainer.train_unsupervised(sequences(words), max_iterations=5)

        logging.debug('saving model...')
        save_model(args.corpus + '.hmm', hmm)

    logging.debug('sampling model...')

    while(True):
        utterance = sample(hmm, random.randint(5, 15)) + '.'
        print utterance
        if args.speak:
            subprocess.call('say "{}!"'.format(utterance), shell=True)