def main():
    # A JSON review loader was disabled here; originally this read
    # "CellPhoneReview-1000.json" and used the first review's 'reviewText'.
    document = "These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!"
    print(document)

    # Tag and extract noun phrases with TextBlob's NLTK-backed tagger.
    nltk_tagger = NLTKTagger()
    extractor = ConllExtractor()
    nltk_blob = TextBlob(document, pos_tagger=nltk_tagger, np_extractor=extractor)
    print(nltk_blob.tags)
    print(nltk_blob.noun_phrases)

    # Same document through the pattern-based tagger, for comparison.
    pattern_tagger = PatternTagger()
    pattern_blob = TextBlob(document, pos_tagger=pattern_tagger, np_extractor=extractor)
    print(pattern_blob.tags)
    print(pattern_blob.noun_phrases)

    # Raw NLTK pipeline: tokenize, POS-tag, then chunk with a regexp grammar.
    tagged = nltk.pos_tag(tokenize(document.lower()))
    print(tagged)

    grammar = ('''
    NP: {<DT>?(<RB.?>*<VB.?>*<NNPS?>+<NNS?>+ | <JJ>*<NNS?>+)} # NP
    ''')
    chunk_parser = nltk.RegexpParser(grammar)
    tree = chunk_parser.parse(tagged)

    # Collect the surface text of every NP subtree the chunker found.
    noun_phrases = [
        ' '.join(token for token, _ in subtree)
        for subtree in tree.subtrees()
        if subtree.label() == 'NP'
    ]
    print(noun_phrases)
'detector': TwitterDetector, 'autoload': True }, UrlDetector.name: { 'detector': UrlDetector, 'autoload': True }, # Detectors that are not automatically loaded by scrubadub KnownFilthDetector.name: { 'detector': KnownFilthDetector, 'autoload': False }, } # type: Dict[str, DetectorConfigurationItem] # BaseBlob uses NLTKTagger as a pos_tagger, but it works wrong BaseBlob.pos_tagger = PatternTagger() def register_detector(detector: Type[Detector], autoload: bool = False): """Register a detector for use with the ``Scrubber`` class. This is used when you don't want to have to import a detector by default. It may be useful for certain detectors with large or unusual dependencies, which you may not always want to import. In this case you can use ``register_detector(NewDetector, autoload=True)`` after your detector definition so that if the file is imported it will be automatically registered. This will mean that you don't need to import the ``NewDetector`` in this file and so its dependencies won't need to be installed just to import this package. The argument ``autoload`` sets if a new ``Scrubber()`` instance should load this ``Detector`` by default. """ detector_configuration[detector.name] = {
import helper
import json
import os
import sqlite3

from textblob.en.taggers import PatternTagger
from textblob.tokenizers import WordTokenizer

tk = WordTokenizer()
tagger = PatternTagger()

# Since there are lots of repeat words, we store an index to the actual token.
# `keys` holds each distinct key once, in first-seen order; `_key_positions`
# maps key -> its index in `keys`, replacing the former `keys.index(key)`
# O(n) scan (O(n^2) over a whole corpus) with an O(1) dict lookup.
# NOTE(review): register new keys only through key_to_int() so the list and
# the dict stay in sync.
keys = []
_key_positions = {}


def key_to_int(key):
    """Return the stable integer id for *key*, registering it on first use.

    The id is the key's position in the module-level ``keys`` list; the same
    key always maps to the same id within a run.
    """
    try:
        return _key_positions[key]
    except KeyError:
        keys.append(key)
        idx = len(keys) - 1
        _key_positions[key] = idx
        return idx


ntoken_freq = {}
npos_freq = {}

conn = sqlite3.connect("data.db")
c = conn.cursor()

USAGE_MINIMUM = 15
NTOKENS_PURGE_THRESHOLD = 5E6

# used to track progress
posts_processed = 0
def spellCheck(text):
    """Print each word of *text* followed by its spelling suggestions.

    Tokenizes *text* with TextBlob (PatternTagger as POS tagger) and calls
    ``Word.spellcheck()`` on every word, printing the word and its
    (candidate, confidence) suggestion list. Also returns the collected
    ``(word, suggestions)`` pairs so callers can use the results without
    re-parsing stdout (backward-compatible: the original returned None).
    """
    # NOTE(review): the original computed `language = getLanguage(text)` but
    # never used it; removed as a dead store — confirm getLanguage() has no
    # required side effects before relying on this version.
    blob = TextBlob(text, pos_tagger=PatternTagger())
    results = []
    for word in blob.words:
        suggestions = word.spellcheck()
        print(word)
        print(suggestions)
        results.append((word, suggestions))
    return results