import textblob_aptagger from textblob import TextBlob, Word ######################################### # Start POS tagger ######################################### pt = textblob_aptagger.PerceptronTagger() ####################################### # Tag query ####################################### def tagQuery(query): taggedquery = "" try: tags = pt.tag(query) if len(tags) > 0: for word in tags: surface = word[0] pos = word[1] # print word try: if pos[0] == 'N' or pos[0] == 'V': tag = Word(surface).lemmatize( pos[0].lower()) + "_" + pos[0] else: if pos[0] == 'J': # Hack -- convert pos J to pos A because that's how # adjectives are represented in dm file tag = Word(surface).lemmatize().lower() + "_A"
# NOTE(review): this excerpt starts mid-script. `line` and `filter_quotes`
# are defined outside the visible code; `line` is presumably one
# JSON-encoded record with a "text" field -- confirm against the caller.
meta = json.loads(line)
grafs = filter_quotes(meta["text"])

if not grafs or len(grafs) < 1:
    raise Exception("no results")

# A single parenthesized argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print(grafs)


######################################################################
## parse and markup text paragraphs for semantic analysis

# Matches a string made up solely of non-word characters.
PAT_PUNCT = re.compile(r'^\W+$')

# NOTE(review): presumably lowercase first letters of the POS tags to keep
# and to lemmatize, respectively -- confirm where these lists are consumed.
POS_KEEPS = ['v', 'n', 'j', 'r']
POS_LEMMA = ['v', 'n']

TAGGER = tag.PerceptronTagger()

# Maps each word seen so far to its integer id; "." is pre-assigned id 0.
UNIQ_WORDS = {".": 0}


def get_word_id(root):
    """Look up, or assign, the unique integer identifier for a word.

    The first time ``root`` is seen it is stored in the module-level
    ``UNIQ_WORDS`` dict with the dict's current size as its id, so ids are
    handed out sequentially starting after the pre-seeded ``"."`` entry.
    Later calls with the same ``root`` return the same id.

    Args:
        root: hashable key (a word) to identify.

    Returns:
        The integer id stored for ``root`` in ``UNIQ_WORDS``.
    """
    # in practice, this should use a microservice via some robust
    # distributed cache, e.g., Cassandra, Redis, etc.
    # No `global` statement is needed: the dict is mutated, never rebound.
    if root not in UNIQ_WORDS:
        UNIQ_WORDS[root] = len(UNIQ_WORDS)

    return UNIQ_WORDS[root]