Example #1
import textblob_aptagger
from textblob import TextBlob, Word
#########################################
# Start POS tagger
#########################################

pt = textblob_aptagger.PerceptronTagger()

#######################################
# Tag query
#######################################


def tagQuery(query):
    taggedquery = ""
    try:
        tags = pt.tag(query)
        if len(tags) > 0:
            for word in tags:
                surface = word[0]
                pos = word[1]
                # print(word)
                try:
                    if pos[0] == 'N' or pos[0] == 'V':
                        tag = Word(surface).lemmatize(
                            pos[0].lower()) + "_" + pos[0]
                    elif pos[0] == 'J':
                        # Hack -- convert pos J to pos A because that's how
                        # adjectives are represented in the dm file
                        tag = Word(surface).lemmatize().lower() + "_A"
                    else:
                        # assumption: keep other tokens as their lowercased
                        # surface form plus the coarse POS letter
                        tag = surface.lower() + "_" + pos[0]
                    taggedquery += tag + " "
                except Exception:
                    # assumption: skip any token the lemmatizer cannot handle
                    pass
    except Exception:
        # assumption: return whatever was tagged before the failure
        pass

    return taggedquery.strip()
Example #2
                meta = json.loads(line)
                grafs = filter_quotes(meta["text"])

                if not grafs or len(grafs) < 1:
                    raise Exception("no results")
                else:
                    print(grafs)


######################################################################
## parse and markup text paragraphs for semantic analysis

import json
import re

# assumption: "tag" below is the textblob-aptagger package, as in Example #1
import textblob_aptagger as tag

PAT_PUNCT = re.compile(r'^\W+$')
POS_KEEPS = ['v', 'n', 'j', 'r']
POS_LEMMA = ['v', 'n']
TAGGER = tag.PerceptronTagger()
UNIQ_WORDS = {".": 0}


def get_word_id(root):
    """lookup/assign a unique identify for each word"""
    global UNIQ_WORDS

    # in practice, this should use a microservice backed by some robust
    # distributed cache, e.g., Cassandra, Redis, etc. (a rough Redis-backed
    # sketch follows after this function)

    if root not in UNIQ_WORDS:
        UNIQ_WORDS[root] = len(UNIQ_WORDS)

    return UNIQ_WORDS[root]
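The comment in get_word_id points at swapping the in-process dict for a shared cache. A rough sketch of that idea, assuming a local Redis server and the redis-py client; the function name get_word_id_cached and the keys "uniq_words" and "uniq_words:next_id" are illustrative, not part of the original code:

import redis

r = redis.Redis(host="localhost", port=6379, db=0)


def get_word_id_cached(root):
    """lookup/assign a unique identifier for each word, backed by Redis"""
    word_id = r.hget("uniq_words", root)

    if word_id is None:
        # reserve a fresh id from an atomic counter, then record it only if
        # no other client has registered this word in the meantime
        new_id = r.incr("uniq_words:next_id") - 1

        if r.hsetnx("uniq_words", root, new_id):
            return new_id

        word_id = r.hget("uniq_words", root)

    return int(word_id)

This keeps ids stable across processes; under contention a losing writer simply re-reads the id the winner stored, at the cost of an extra round trip on that miss.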