def posNGram(self, sentence, n=2):
    '''
    Count the POS-tag n-grams of a tokenized sentence.

    A window of ``n`` consecutive tokens is slid over ``sentence``; each
    window is passed to ``posTag`` and the tags it yields are collected
    into a tuple, which is then tallied.

    Tag reference left by the original author:
    https://cs.nyu.edu/grishman/jet/guide/PennPOS.html

    sentence -- one tokenized sentence (a sliceable sequence of tokens).
    n        -- number of tokens per window (default 2).

    Returns a dict mapping each tag tuple to the number of windows that
    produced it. The dict is empty when ``n`` is smaller than 1 or when
    the sentence holds fewer than ``n`` tokens.
    '''
    posDic = {}

    # An n-gram needs at least one token; without this guard the slices
    # below would be empty (or wrap around) and produce meaningless counts.
    if n < 1:
        return posDic

    # The former odd/even branches used different index arithmetic but
    # both visited exactly the windows sentence[start:start + n] for
    # start = 0 .. len(sentence) - n, so a single loop covers both.
    for start in range(len(sentence) - n + 1):
        # Each window is tagged on its own, as before.
        # NOTE(review): tagging the whole sentence once would avoid the
        # repeated calls, but may change the tags if posTag is
        # context-sensitive -- confirm before changing.
        tagged = posTag(sentence[start:start + n])
        # Index 1 of every tagged item is taken as the tag
        # (presumably (token, tag) pairs -- verify against posTag).
        posTuple = tuple(item[1] for item in tagged)
        posDic[posTuple] = posDic.get(posTuple, 0) + 1

    return posDic
def grabMySQLdocument(self, description):
    '''
    Fetch a stored document by its description and return it POS-tagged.

    Connects to the local ``nlpText`` database, selects the ``document``
    column of the ``Documents`` row(s) whose ``description`` matches, and
    uses the first row returned. Underscores in the text are replaced by
    apostrophes before it is tokenized with ``wordTokenize`` and tagged
    with ``posTag``.

    description -- value matched against ``Documents.description``.

    Returns whatever ``posTag`` returns for the tokenized document.

    Raises IndexError when no row matches ``description``.
    '''
    db = MySQLdb.connect("localhost", "dondi", "", "nlpText")
    try:
        cursor = db.cursor()
        # Let the driver quote the value instead of interpolating it into
        # the SQL text: this prevents SQL injection and keeps descriptions
        # that contain quotes from breaking the statement.
        sql = """SELECT document FROM Documents WHERE description = %s"""
        cursor.execute(sql, (description,))
        rows = cursor.fetchall()
        # Kept from the original flow; nothing is written by this method.
        db.commit()
    finally:
        # Always release the connection, even if the query fails.
        db.close()

    if not rows:
        # Same exception type the original raised implicitly via rows[0],
        # but with a message that says what went wrong.
        raise IndexError(
            "no document found for description %r" % (description,))

    # Presumably apostrophes were stored as underscores when the document
    # was saved -- verify against the code that writes the table. Note that
    # any genuine underscore in the text is converted as well.
    text = rows[0][0].replace('_', "'")
    words = wordTokenize(text)
    words = posTag(words)
    return words