Example 1
    def __call__(self, doc):

        sentences = sentence_tokenizer(doc)

        doc2 = []

        for sent in sentences:
            
            sent = [self.modify_word(w) for w in sent]
            doc2.append(' '.join(sent))

        doc2 = '\n'.join(doc2)

        return doc2
Example 2
    def __call__(self, doc):

        sents = sentence_tokenizer(doc)

        doc2 = []
        for sent in sents:
            if not is_any_lowercase(sent):

                if len(sent) > 4:
                    print "DECAPING: '{}'".format(' '.join(sent))
                    
                sent = map(unicode.lower, sent)

            doc2.append(' '.join(sent))

        doc2 = ' '.join(doc2)
        return doc2
Example 3
    def __call__(self, doc):

        sents = sentence_tokenizer(doc)

        doc2 = []
        for sent in sents:
            if not is_any_lowercase(sent):

                if len(sent) > 4:
                    print("DECAPING: '{}'".format(' '.join(sent)))

                sent = map(unicode.lower, sent)

            doc2.append(' '.join(sent))

        doc2 = ' '.join(doc2)
        return doc2
Example 4
    def __call__(self, text):
        '''
        Runs the parser.

        Args:
            text: a string document
        Returns:
            doc2: a string document
        '''

        sents = sentence_tokenizer(text)

        doc2 = []
        for sent in sents:
            if not is_any_lowercase(sent):

                if len(sent) > self.min_length:
                    self.logger.info("DECAPING: '{}'".format(' '.join(sent)))
                    sent = [x.lower() for x in sent]

            doc2.append(' '.join(sent))

        doc2 = ' '.join(doc2)
        return doc2
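
All of the __call__ methods above belong to a parser object that is configured once and then applied to raw document strings. The sketch below shows one way the last example might be wired up end to end; the class name CapsNormalizer and the two helper functions are simplified stand-ins invented for illustration, not the project's actual tokenizer or casing check.

import logging

def sentence_tokenizer(text):
    # Stand-in: split on periods, then whitespace-tokenize each sentence.
    return [s.split() for s in text.split('.') if s.strip()]

def is_any_lowercase(tokens):
    # Stand-in: True if any character in any token is lowercase.
    return any(c.islower() for tok in tokens for c in tok)

class CapsNormalizer(object):
    def __init__(self, min_length=4):
        self.min_length = min_length
        self.logger = logging.getLogger(__name__)

    def __call__(self, text):
        # Same logic as the last example above: lowercase all-caps sentences
        # longer than min_length and log what was changed.
        sents = sentence_tokenizer(text)
        doc2 = []
        for sent in sents:
            if not is_any_lowercase(sent):
                if len(sent) > self.min_length:
                    self.logger.info("DECAPING: '{}'".format(' '.join(sent)))
                    sent = [x.lower() for x in sent]
            doc2.append(' '.join(sent))
        return ' '.join(doc2)

parser = CapsNormalizer(min_length=4)
print(parser("THIS HEADLINE IS WRITTEN ENTIRELY IN CAPS. This sentence is left alone."))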
    for row in query:
        # Unicode-normalize each review, strip surrounding whitespace and
        # collapse internal whitespace to undo the formatting in older reviews
        text = unicodedata.normalize("NFKD", row[0].strip())
        data.append(re.sub(r'\s+', ' ', text))
    return data

pitchfork = normalize_corpus(pitchfork)


# Create sentence tokens using sentence_tokenizer from the local tokenizer
# module and store the tokenized sentences from every review in a list.
# The sentences object could be used for processing directly, but we'll write
# out a text file instead: the file can be streamed through an iterator to
# feed a gensim word2vec model, and reading it back in also makes sure the
# text is fully unicode-normalized.

pitchfork_sentences = sentence_tokenizer(pitchfork)


# There are 503 sentences with escapes followed by non-word characters
# Hand sampling / search shows they are all embedded in the DB and site text
regex = re.compile(r'\\[a-z]')
errors = [i for i in pitchfork_sentences if regex.search(i)]
len(errors)

# 10 sentences contain \t; these are javascript errors in old reviews, all of
# them quoting outside text, so they are of no consequence
regex = re.compile(r'\\t')
errors = [i for i in pitchfork_sentences if regex.search(i)]
len(errors)

pitchfork_sentences = [i for i in pitchfork_sentences if not regex.search(i)]
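
# Write the cleaned sentences out to a text file, one sentence per line, then
# stream the file back in to feed a gensim word2vec model, as described above.
# This is a minimal sketch: the file name, the SentenceStream helper and the
# Word2Vec parameters are illustrative assumptions, not part of the original
# pipeline.
import io
from gensim.models import Word2Vec

with io.open('pitchfork_sentences.txt', 'w', encoding='utf-8') as fh:
    for sentence in pitchfork_sentences:
        fh.write(sentence + '\n')

class SentenceStream(object):
    """Stream a one-sentence-per-line text file as lists of tokens."""
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with io.open(self.path, encoding='utf-8') as fh:
            for line in fh:
                yield line.split()

model = Word2Vec(SentenceStream('pitchfork_sentences.txt'), min_count=5, workers=4)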