def getSentenceFeatures(postDictList, nlp=None):
    """Tag and parse pre-tokenized sentences and collect per-token features.

    Args:
        postDictList: Iterable of dicts. The values of each dict are passed
            through ``convertListInUnicode`` and used as the pre-tokenized
            words of one sentence.
        nlp: Optional, already-loaded spaCy ``English`` pipeline. When omitted
            a new one is created, which keeps the original call signature
            working but pays the model-loading cost on every call.

    Returns:
        A list with one ``OrderedDict`` per input dict. Each maps the 1-based
        token position to the list
        ``[pos_, pos, tag, tag_, dep, dep_]`` read from the spaCy token.
    """
    if nlp is None:
        nlp = English()

    poSpeechDictList = []
    for postDict in postDictList:
        unicodeTokens = convertListInUnicode(postDict.values())
        doc = nlp.tokenizer.tokens_from_list(unicodeTokens)
        # Tag first, then parse: this is spaCy's own pipeline order, so the
        # parser sees the POS tags instead of an untagged document.
        nlp.tagger(doc)
        nlp.parser(doc)

        poSpeechDict = OrderedDict()
        for index, token in enumerate(doc, 1):
            poSpeechDict[index] = [
                token.pos_,
                token.pos,
                token.tag,
                token.tag_,
                token.dep,
                token.dep_,
            ]
        poSpeechDictList.append(poSpeechDict)
    return poSpeechDictList
import sys

import spacy
from spacy.en import English

# Read the whole of stdin, decode it as UTF-8 and run the English pipeline.
nlp = English()
doc = nlp(sys.stdin.read().decode('UTF-8'))
nlp.tagger(doc)

# pos.txt    : tokens, with PROPN/NOUN/VERB tokens suffixed by "_<POS>".
# posner.txt : same, after merging named entities into single tokens that are
#              additionally suffixed by "_<ENTITY_TYPE>".
with open("./pos.txt", "w") as f, open("posner.txt", "w") as f2:
    for word in doc:
        only_tags = word.text
        if word.pos_ in ('PROPN', 'NOUN', 'VERB'):
            only_tags += "_" + word.pos_
        only_tags += " "
        f.write(only_tags.encode("UTF-8"))

    print(doc.ents)

    # Collapse every labelled entity span into one token so that multi-word
    # entities come out as a single "New_York_GPE"-style item below.
    for ent in doc.ents:
        if ent.label_:
            ent.merge(ent.root.tag_, ent.text, ent.label_)

    for word in doc:
        if word.ent_type_:
            # Merged entities may contain spaces; join them with underscores.
            with_ner = "%s_%s" % (word.text.replace(" ", "_"), word.ent_type_)
        else:
            with_ner = word.text
        if word.pos_ in ('PROPN', 'NOUN', 'VERB'):
            with_ner += "_" + word.pos_
        with_ner += " "
        # Previously the string was built and dropped, leaving posner.txt
        # empty; write it out like the POS-only pass does.
        f2.write(with_ner.encode("UTF-8"))
# In[4]:

import spacy.util
from spacy.en import English
from spacy.syntax.parser import OracleError
from spacy.syntax.util import Config


# In[6]:

nlp = English()


# In[33]:

# Build a document from an explicit token list, then tag and parse it.
# Each word is its own list item: a misplaced quote previously fused
# u'you, ' and u'hate' into the single token u'you, hate'.
tokens = nlp.tokenizer.tokens_from_list(
    [u'Me', u'and', u'you', u'hate', u'pizza'])
a = nlp.tagger(tokens)
a = nlp.parser(tokens)


# In[51]:

# Show the syntactic head of the second token (u'and').
print(tokens[1].head)


# In[ ]: