Example #1
def replace_person_mail_names(filename, outputfile=None, replacetoken='***'):
    """(str) -> None
	Replace all person names and begining of email adresses by replacetoken (defaut ***)
	if outputfile, the file is not rewritten but an new file output is written 
	"""
    with open(filename, 'r') as f:
        rawtext = f.read()
    tokens = word_tokenize(rawtext)  # "Mrs.Truc" -> ['Mrs.Truc'] and "*****@*****.**" -> ["nemo.nemo", "@", "xmail.com"]
    # tokens = wordpunct_tokenize(rawtext)  # "Mrs.Truc" -> ['Mrs', '.', 'Truc'] and "*****@*****.**" -> ["nemo", ".", "nemo", "@", "xmail", ".", "com"]
    chunked_tokens = ne_chunk(nltk.pos_tag(tokens))

    if not outputfile: outputfile = filename
    with open(outputfile, 'w') as f:
        prec, curr = '', ''
        for tok in chunked_tokens:
            if isinstance(tok, Tree) and tok.label() == 'PERSON':
                # if the token is a person, replace it with replacetoken
                curr = replacetoken
            elif isinstance(tok, Tree):
                curr = ' '.join(x[0] for x in tok)
            elif tok[0] == '@':
                # if the token is '@', replace the previous token (the email local part) with replacetoken
                prec = replacetoken
                curr = '@'
            else:
                curr = tok[0]
            f.write(prec)
            if curr in PUNCTUATION or "'" in curr: prec = curr
            else: prec = ' ' + curr
        f.write(curr)
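# A minimal sketch of the setup the function above assumes; the PUNCTUATION definition
# and the file names here are assumptions, not part of the original example.
import string
import nltk
from nltk import word_tokenize, ne_chunk
from nltk.tree import Tree

PUNCTUATION = set(string.punctuation)
# replace_person_mail_names('letters.txt', 'letters_anonymized.txt')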
 def _process_simpleHash(self, simpleHash):
     # Extract entities from keys resulting from SimpleExtractor process_*
     entityHash = {}
     for data in simpleHash:
         occs = simpleHash[data]['occurences']
         proxLoc = simpleHash[data]['proxLoc']
         # Tokenize sentences
         for sent in tokenize_sentences(data):
             # Tokenize words
             tokens = tokenize_words(sent)
             # Tag words with Parts of Speech
             tagged = pos_tag(tokens)
             # Identify named entities
             entities = ne_chunk(tagged)
             for ent in entities:
                 if isinstance(ent, NLTKParseTree):
                     # Is it a wanted type?
                     if ent.node in self.types:
                         # Should we keep the PoS tag?
                         if self.keepPos:
                             txts = ['/'.join(token) for token in ent.leaves()]
                         else:
                             txts = [token[0] for token in ent.leaves()]
                         txt = ' '.join(txts)
                         new = {txt: {'text': txt,
                                      'occurences': occs,
                                      'proxLoc': proxLoc[:]}}
                         entityHash = self._mergeHash(entityHash, new)
     return entityHash
 def getPersonOrPlaceAnswers(self, mainObj):
     corrIndex = 0
     itr = 0
     while self.answer == "" and corrIndex < len(mainObj.sim) and itr < 4:
         currentPara = mainObj.paras[mainObj.sim[corrIndex][0]]
         simCoeff = self.getSentQueryCorrelation(mainObj.question,
                                                 mainObj.qv, currentPara)
         sentences = sent_tokenize(currentPara)
         answers = []
         for iSimilarSent in simCoeff:
             sent = sentences[iSimilarSent[0]]
             taggedSent = pos_tag(word_tokenize(sent))
             chunked = ne_chunk(taggedSent)
             temp = {}
             for chunk in chunked:
                 if type(chunk) == Tree:
                     temp[chunk.label()] = [c[0] for c in chunk]
             answers.append(temp)
         for entity in answers:
             if mainObj.question_type in entity.keys():
                 tAnswer = entity[mainObj.question_type][0]
                 if PorterStemmer().stem(
                         tAnswer.lower()) not in mainObj.qv.keys():
                     self.answer = tAnswer
                     break
         corrIndex += 1
         itr += 1
     return self.answer
Example #4
def extract_ner(text: str):
    chunks = []
    tree = ne_chunk(pos_tag(word_tokenize(text)))
    for leaf in tree:
        if hasattr(leaf, 'label'):
            chunks.append([leaf.label(), ' '.join(c[0] for c in leaf)])
    return chunks
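# A hypothetical call; the labels returned depend on the NLTK chunker models installed.
# extract_ner("Tim Cook runs Apple from Cupertino.")
# -> something like [['PERSON', 'Tim Cook'], ['ORGANIZATION', 'Apple'], ['GPE', 'Cupertino']]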
Example #5
def replace_person_names_version2(filename,
                                  outputfile=None,
                                  replacetoken='***'):
    """(str) -> None
	Replace all person names and begining of email adresses by replacetoken (defaut ***)
	if outputfile, the file is not rewritten but an new file output is written 
	"""
    with open(filename, 'r') as f:
        rawtext = f.read()
    tokens = word_tokenize(rawtext)  # "Mrs.Truc" -> ['Mrs.Truc'] and "*****@*****.**" -> ["nemo.nemo", "@", "xmail.com"]
    # tokens = wordpunct_tokenize(rawtext)  # "Mrs.Truc" -> ['Mrs', '.', 'Truc'] and "*****@*****.**" -> ["nemo", ".", "nemo", "@", "xmail", ".", "com"]
    tokens_with_pos = nltk.pos_tag(tokens)
    chunked_tokens = ne_chunk(tokens_with_pos)
    if not outputfile: outputfile = filename
    with open(outputfile, 'w') as f:
        for tok in chunked_tokens:
            if isinstance(tok, Tree) and tok.label() == 'PERSON':
                # if the token is a person, replace it with replacetoken
                f.write(' ' + replacetoken)
            elif isinstance(tok, Tree):
                f.write(' ' + ' '.join(x[0] for x in tok))
            elif tok[0] in PUNCTUATION or "'" in tok[0]:
                f.write(tok[0])
            else:
                f.write(' ' + tok[0])
Example #6
def frequency():
    valid_input = False
    while not valid_input:
        tips = input("Share your beauty tips: ")
        valid_input = validate_skin_input(tips)

    words_tokenize = word_tokenize(tips)
    words_tagged = pos_tag(words_tokenize, tagset="universal")

    print("\nPart of speech words")
    for word in words_tagged:
        print(f"{word[0]}: {word[1]}")

    print("\nFrequency Distributions")
    freq_dist = FreqDist(words_tokenize)
    for fd, count in freq_dist.most_common():
        print(f"{fd}: {count}")

    user_input = ""
    while user_input != "Y" and user_input != "N":
        user_input = input(
            "Do you want to show parse tree? [Y|N, case sensitive]: ")

    if user_input == "Y":
        ner = ne_chunk(words_tagged)
        ner.draw()
    else:
        pass
Example #7
def tag_with_NLTK(text_and_id):
    text, id = text_and_id
    set_entities_tag = {(' '.join(c[0] for c in chunk), chunk.label())
                        for chunk in ne_chunk(pos_tag(word_tokenize(text)))
                        if hasattr(chunk, 'label')}

    return (set_entities_tag, id)
Example #8
def extract_entities(words):
    entities = []
    for chunk in ne_chunk(pos_tag(words)):
        if hasattr(chunk, 'node'):            
            performer = ' '.join(c[0] for c in chunk.leaves())
            entities.append(performer.lower())
    return entities
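# Note: hasattr(chunk, 'node') targets the pre-NLTK-3 Tree API. On NLTK 3 and later the
# attribute was replaced by the label() method, so the same check is usually written with
# isinstance(..., Tree); a sketch of the equivalent function against the newer API:
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree

def extract_entities_v3(words):
    # collect every chunked named entity, lower-cased, using Tree/label() instead of .node
    entities = []
    for chunk in ne_chunk(pos_tag(words)):
        if isinstance(chunk, Tree):
            entities.append(' '.join(c[0] for c in chunk.leaves()).lower())
    return entities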
Example #9
 def postags(self, doc):
     for f in ['title', 'desc', 'text']:
         # extend this field's POS tags one sentence at a time
         for sentence in doc.tokens[f]:
             doc.postags[f].extend(pos_tag(sentence))
         doc.entities[f] = [
             c for c in ne_chunk(doc.postags[f], binary=True)
             if hasattr(c, '_label')
         ]
         doc.entities[f] = list(
             set([
                 ' '.join([l[0] for l in e.leaves()])
                 for e in doc.entities[f]
             ]))
         doc.topmod[f] = [
             t for t in doc.postags[f] if t[1] in self.topmod_list
         ]
         doc.topmod[f] = [(self.replace_list[t[0]],
                           t[1]) if t[0] in self.replace_list.keys() else
                          (t[0], t[1]) for t in doc.topmod[f]]
         doc.topmod[f] = [
             t for t in doc.topmod[f]
             if lower(t[0]) not in stopwords.words('english') + self.punct +
             self.remove_list[0] and t[1] not in self.remove_list[1]
         ]
         doc.postags[f] = [(self.replace_list[t[0]],
                            t[1]) if t[0] in self.replace_list.keys() else
                           (t[0], t[1]) for t in doc.postags[f]]
         doc.postags[f] = [
             t for t in doc.postags[f]
             if lower(t[0]) not in stopwords.words('english') + self.punct +
             self.remove_list[0] and t[1] not in self.remove_list[1]
         ]
def nltk_entity_groups(text):
    """Return all contiguous NER tagged chunks by NLTK."""
    parse_tree = ne_chunk(pos_tag(word_tokenize(text)))
    ner_chunks = [
        ' '.join([l[0] for l in t.leaves()]) for t in parse_tree.subtrees()
        if t.label() != 'S'
    ]
    return ner_chunks
Example #11
 def chunk(self, tags):
     # Named-entity recognition (using ne_chunk, which nltk supports out of the box)
     r = []
     for chunk in ne_chunk(tags):
         if hasattr(chunk, 'label'):
             r.append(' '.join(c[0] for c in chunk))
     r = ' '.join(r).split(' ')
     return r
Example #12
def mywarWritten2(*args):

    sents = tokenize.sent_tokenize(myvar.get())

    sent = tokenize.word_tokenize(sents[int(myvar2.get())])
    tagged_sent = tag.pos_tag(sent)
    #print(tagged_sent)
    tree = chunk.ne_chunk(tagged_sent)
    tree.draw()
def find_named_entity(sentence, word_tokenizer):
    # U+0027 (') and U+2019 (’)
    sentence = sentence.replace('\u2019', '\u0027')
    default_count = 0
    default_entity_word = []
    for chunk in ne_chunk(pos_tag(word_tokenize(sentence))):
        if hasattr(chunk, 'label'):
            #print(chunk.label(), ' '.join(c[0] for c in chunk.leaves()))
            default_entity_word.append(chunk.label() + ' ' + ' '.join(c[0] for c in chunk.leaves()))
            default_count += 1
    custom_count = 0
    custom_entity_word = []
    for chunk in ne_chunk(pos_tag(word_tokenizer.tokenize(sentence))):
        if hasattr(chunk, 'label'):
            #print(chunk.label(), ' '.join(c[0] for c in chunk.leaves()))
            custom_entity_word.append(chunk.label() + ' ' + ' '.join(c[0] for c in chunk.leaves()))
            custom_count += 1
    return [default_count, custom_count, default_entity_word, custom_entity_word]
	def postags(self, doc):
		for f in ['title', 'desc', 'text']:
			[doc.postags[f].extend(pos_tag(sentence)) for sentence in doc.tokens[f]]
			doc.entities[f] = [c for c in ne_chunk(doc.postags[f], binary=True) if hasattr(c, '_label')]
			doc.entities[f] = list(set([' '.join([l[0] for l in e.leaves()]) for e in doc.entities[f]]))
			doc.topmod[f] = [t for t in doc.postags[f] if t[1] in self.topmod_list]			
			doc.topmod[f] = [(self.replace_list[t[0]], t[1]) if t[0] in self.replace_list.keys() else (t[0], t[1]) for t in doc.topmod[f]]
			doc.topmod[f] = [t for t in doc.topmod[f] if lower(t[0]) not in stopwords.words('english')+self.punct + self.remove_list[0] and t[1] not in self.remove_list[1]]
			doc.postags[f] = [(self.replace_list[t[0]], t[1]) if t[0] in self.replace_list.keys() else (t[0], t[1]) for t in doc.postags[f]]
			doc.postags[f] = [t for t in doc.postags[f] if lower(t[0]) not in stopwords.words('english')+self.punct + self.remove_list[0] and t[1] not in self.remove_list[1]]
def menu_2():
    tokenized = word_tokenize(user_sentences)
    pt = pos_tag(tokenized)
    fd = FreqDist(tokenized).most_common()

    print('Part of Speech')
    for word, tag in pt:
        print(word, '-', tag)

    print('Frequency')
    for word, count in fd:
        print(word, '-', count)

    while True:
        show = input('Want draw show tree Y/N : ')
        if show == 'Y' or show == 'N':
            break
   
    if show == 'Y':
        ne_chunk(pt).draw()
def make_syntax_trees(sentences):
    trees = []
    i = 0  #to count where the program is at
    for sentence in sentences:
        print(i)
        words = tokenize.word_tokenize(sentence)
        tagged_sentence = tag.pos_tag(words)
        tree = chunk.ne_chunk(tagged_sentence)
        trees.append(tree)
        i += 1
    return trees
Example #17
def get_names(sentence):
    assert isinstance(sentence, list), "Sentence must be tokenized first"
    tagged_sent = nltk.tag.pos_tag(sentence)
    names = [
        i[0] for i in list(
            chain(*[
                chunk.leaves() for chunk in ne_chunk(tagged_sent)
                if isinstance(chunk, Tree)
            ]))
    ]
    possessives = [word for word in sentence if word.endswith("s'")]
    return names + possessives
def nltk_entity_groups(text):
    # Return all contiguous NER tagged chunks by NLTK.
    # https://www.nltk.org/book/ch07.html
    """nltk.ne_chunk: returns a nested nltk.tree.Tree object 
    so you would have to traverse the Tree object to get to the NEs.
    POS(part-of speech)-tagger: processes a sequence of words,
    and attaches a part of speech tag to each word
    """
    parse_tree = ne_chunk(pos_tag(word_tokenize(text)))
    ner_chunks = [' '.join([l[0] for l in t.leaves()])
                  for t in parse_tree.subtrees() if t.label() != 'S']
    return ner_chunks
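# A hypothetical call showing the shape of the result; the exact chunks returned depend
# on the NLTK data and models installed.
# nltk_entity_groups("Samuel Jackson visited the Eiffel Tower in Paris.")
# -> something like ['Samuel Jackson', 'Eiffel Tower', 'Paris']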
    def get_nltk_vectors(self, texts: List[str]):
        # https://gist.github.com/japerk/1909413
        from textblob import TextBlob
        sid = self.nltk_sid
        vsid = self.vader_sid
        pdict = self.pdict
        n_tokens_in = self.n_tokens_in
        rake = self.rake_nltk
        nltk_texts = [fasttext.tokenize(text) for text in texts]
        textblob_sentiments = [[sentiment.polarity, sentiment.subjectivity] for sentiment in [TextBlob(text).sentiment for text in texts]]
        textblob_sentiments = torch.tensor(textblob_sentiments).unsqueeze(1).expand(len(texts), n_tokens_in, 2)
        textblob_sentiments = textblob_sentiments.to(get_device())

        mask = stack_and_pad_tensors(list(map(lambda x: torch.ones(len(x), dtype=int), nltk_texts)), n_tokens_in)
        mask = mask.to(get_device())
        mask = self.is_mask_em(mask)
        has_digit = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([has_digits(str(t)) for t in x]), nltk_texts)), n_tokens_in)
        has_digit = has_digit.to(get_device())
        has_digit = self.has_digit_em(has_digit)

        m = self.text_model
        nltk_emb = stack_and_pad_tensors([torch.tensor([m[t] for t in sent]) for sent in nltk_texts], n_tokens_in) # if t in m else np.zeros(m.vector_size)
        nltk_emb = nltk_emb.to(get_device())
        sid_vec = torch.tensor([list(sid.polarity_scores(t).values()) for t in texts])
        sid_vec = sid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, sid_vec.size(1))
        sid_vec = sid_vec.to(get_device())
        vsid_vec = torch.tensor([list(vsid.polarity_scores(t).values()) for t in texts])
        vsid_vec = vsid_vec.unsqueeze(1).expand(len(texts), n_tokens_in, vsid_vec.size(1))
        vsid_vec = vsid_vec.to(get_device())
        conlltags = [[ptags for ptags in nltk.tree2conlltags(ne_chunk(pos_tag(x)))] for x in nltk_texts]

        pos = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[tag.lower()] for token, tag, ne in x]), conlltags)), n_tokens_in)
        pos = pos.to(get_device())
        pos_emb = self.tag_em(pos)
        ner = stack_and_pad_tensors(
            list(map(lambda x: torch.tensor([pdict[ne.lower().split("-")[-1]] for token, tag, ne in x]), conlltags)), n_tokens_in)
        ner = ner.to(get_device())
        ner_emb = self.tag_em(ner)

        phrases = [get_rake_nltk_phrases(rake, t) for t in texts]

        key_wc_rake_nltk = [get_rake_nltk_wc(tokens, phr) for tokens, phr in zip(nltk_texts, phrases)]
        key_wc_rake_nltk = stack_and_pad_tensors(key_wc_rake_nltk, self.n_tokens_in)
        key_wc_rake_nltk = key_wc_rake_nltk.to(get_device())
        nltk_rake_vectors = self.key_wc_rake_nltk(key_wc_rake_nltk)

        result = torch.cat([vsid_vec, nltk_emb, textblob_sentiments, pos_emb, ner_emb, nltk_rake_vectors, sid_vec, mask, has_digit], 2)
        result = result.to(get_device())
        result = self.nltk_nn(result)
        return result
def get_location_entities(content):
    pos_sentences_tokens = prep_named_entities(content)
    named_entities = []
    locations = []
    for pos_tree in pos_sentences_tokens:
        ne_chunk_tree = ne_chunk(pos_tree)
        leaves = sub_leaves(ne_chunk_tree, "GPE")
        if leaves:
            named_entities.append(leaves)
    for sentence_entities in named_entities:
        for entity in sentence_entities:
            locations.append(" ".join([name[0] for name in entity]))
    return list(OrderedDict.fromkeys(locations))
def get_person_entities(content):
    pos_sentences_tokens = prep_named_entities(content)
    named_entities = []
    people = []
    for pos_tree in pos_sentences_tokens:
        ne_chunk_tree = ne_chunk(pos_tree)
        leaves = sub_leaves(ne_chunk_tree, "PERSON")
        if leaves:
            named_entities.append(leaves)
    for sentence_entities in named_entities:
        for entity in sentence_entities:
            people.append(" ".join([name[0] for name in entity]))
    return list(OrderedDict.fromkeys(people))
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		tagger = ngrams.make_backoff_tagger()
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			sentences = pos.tokenize_sents(doc["cleansed_text"])
			tags = pos.tokenize_words(sentences)
			for sent in tags:
				tagged_sent = tagger.tag(sent)
				d = ne_chunk(tagged_sent)
				chunks = tree2conlltags(d)
				print(chunks)
			if ind == 10:
				break
def get_all_named_entities(content):
    pos_sentences_tokens = prep_named_entities(content)
    named_entities = []
    nouns = []
    for pos_tree in pos_sentences_tokens:
        # binary = True grabs all named entities instead of classifying person, location, etc.
        ne_chunk_tree = ne_chunk(pos_tree, binary=True)
        leaves = sub_leaves(ne_chunk_tree, "NE")
        if leaves:
            named_entities.append(leaves)
    for sentence_entities in named_entities:
        for entity in sentence_entities:
            nouns.append(" ".join([name[0] for name in entity]))
    return list(OrderedDict.fromkeys(nouns))
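# A minimal sketch of the difference described in the binary=True comment above, using
# plain NLTK only (prep_named_entities and sub_leaves from this module are not assumed):
from nltk import word_tokenize, pos_tag, ne_chunk

tagged = pos_tag(word_tokenize("Angela Merkel met executives from Siemens in Berlin."))
typed_tree = ne_chunk(tagged)                 # subtrees labelled PERSON, ORGANIZATION, GPE, ...
binary_tree = ne_chunk(tagged, binary=True)   # every entity collapsed into a single NE label
print([t.label() for t in typed_tree.subtrees() if t.label() != 'S'])
print([t.label() for t in binary_tree.subtrees() if t.label() != 'S'])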
Example #24
def print_names(filename):
    """(str) -> None
	Find all the names into a text file and print them into the terminal
	"""
    with open(filename, 'r') as f:
        rawtext = f.read()
        tokens = word_tokenize(rawtext)
        tokens_with_pos = nltk.pos_tag(tokens)
        print("---------------------")
        print("Named entities")
        print()
        named_entities = [chunk for chunk in ne_chunk(tokens_with_pos)]  # if isinstance(chunk, Tree)
        print(tokens_with_pos)
        print(named_entities)
 def test_interactive(self):
     docs = self.source.find_clean(batch_size=1000)
     tagger = ngrams.make_backoff_tagger()
     print()
     for ind, doc in docs:
         sentences = pos.tokenize_sents(doc["cleansed_text"])
         tags = pos.tokenize_words(sentences)
         for sent in tags:
             tagged_sent = tagger.tag(sent)
             d = ne_chunk(tagged_sent)
             chunks = tree2conlltags(d)
             print("CHUNKS" + str(chunks))
             print("NE" + str(cnll.get_ne(chunks)))
             print("NOUNS" + str(cnll.get_nouns(chunks)))
         if ind == 10:
             break
Example #26
def nltk_method(str):
    # INSTALL
    # nltk.download()
    # nltk.download('averaged_perceptron_tagger')
    # nltk.download('maxent_ne_chunker')
    # nltk.download('words')

    ex = str
    u = unicode(ex, 'utf-8')
    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    sent = preprocess(ex)
    cp = nltk.RegexpParser(pattern)
    cs = cp.parse(sent)
    iob_tagged = tree2conlltags(cs)
    ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))
    print(ne_tree)
def get_named_entities(sentence):
  """
  Get named entities from a sentence.
  """
  tokens = wordpunct_tokenize(sentence)
  posTaggedTokens = pos_tag(tokens)
  tree = ne_chunk(posTaggedTokens)
  subtrees = dropFirst(tree.subtrees())
  entities = defaultdict(int)
  for subtree in subtrees:
    # We could add the entity type here (e.g. PERSON, ORGANIZATION)
    # entity = subtree.node + " "
    #
    words = [word for (word,pos) in subtree if word != "FTP"]
    if len(words) > 0:
      entities[" ".join(words)] += 1
  return entities
Example #28
def data(fname):
    reader = csv.DictReader(open(fname))
    idx = 0
    for row in reader:
        if idx % 100 == 0: print "Row %d" % idx
        row["words_basic"] = re.findall(r'\w+', row["Comment"])
        row["words_nltk"] = word_tokenize(row["Comment"])
        row["pos_tag"] = []
        row["ne_tags"] = []
        for sent in sent_tokenize(row["Comment"]):
            words = tagger.tag(word_tokenize(sent))
            row["pos_tag"].extend([wd[1] for wd in words])
            for x in ne_chunk(words):
                if x.__class__.__name__ == "Tree":
                    row["ne_tags"].append(x.node)
        yield row
        idx += 1
def clean_dict(doc, tagger=nltk.pos_tag):
    """ Processes NLP features from cleansed_text. All other functions
	wrap this one. 
	Serves to act as the NLP-front end for reddit corpus
	parsing. Dictionaries and json strings are accepted and return
	dictionaries containing additional information. The processing
	done here represents the general annotations. The following
	are the new fields added to the dictionary. Classifiers
	will work to modify or wrap these methods. 

	::

		{
			conlltags 		: [[(word, pos, BIO)]],
			nouns 			: [word],
			named_entities 		: [[word, pos, BIO]],
			cleansed_text 		: [[word]]
		}

	:param doc: dictionary of reddit corpus.
	:type doc: dict

	:param tagger: A pos tagger. 
	:type tagger: Tagger

	:returns: dict
	"""

    if "_id" in doc:
        del (doc["_id"])
    sentences = pos.tokenize_sents(doc["cleansed_text"])
    tags = pos.tokenize_words(sentences) or []
    doc["conlltags"] = []
    doc["nouns"] = []
    doc["named_entities"] = []
    for sent in tags:
        tagged_sent = nltk.pos_tag(sent) or []
        d = ne_chunk(tagged_sent) or []
        chunks = tree2conlltags(d)
        doc["conlltags"].append(chunks)
        doc["nouns"].extend(cnll.get_nouns(chunks))
        doc["named_entities"].extend(cnll.get_ne(chunks))
    return doc
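# A self-contained sketch of the per-sentence annotation step described in the docstring
# above, using plain NLTK only (the project-specific pos and cnll helpers are not assumed;
# the noun/NE extraction below is a rough stand-in for cnll.get_nouns / cnll.get_ne):
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk.util import tree2conlltags

words = word_tokenize("Barack Obama visited Paris last week.")
conll = tree2conlltags(ne_chunk(pos_tag(words)))              # [(word, pos, BIO), ...]
nouns = [w for w, p, bio in conll if p.startswith('NN')]
named_entities = [(w, p, bio) for w, p, bio in conll if bio != 'O']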
Example #30
def ne_removal(text):
    '''
        The ne_removal function receives a text corpus, tokenizes it, and tags
        parts of speech and named entities. Then it removes the named entities
        from the tokenized text.

        INPUT:
        text = string with text

        OUTPUT:
        tokens_no_ne = array of tuples with token and part of speech
    '''
    raw_tokens = word_tokenize(text)

    chunked = ne_chunk(pos_tag(raw_tokens))

    tokens_no_ne = [leaf for leaf in chunked if (type(leaf) != nltk.Tree)]

    return tokens_no_ne
 def extract_names(self):
     names = []
     for sentence in self.sentences:
         text = word_tokenize(sentence)
         tags = nltk.pos_tag(text)
         # for chunk in ne_chunk(tags):
         # if isinstance(chunk, Tree):
         # print chunk
         for i in list(
                 chain(*[
                     chunk.leaves() for chunk in ne_chunk(tags)
                     if isinstance(chunk, Tree)
                 ])):
             names.extend(i)
     unique_names = list(set(names))
     unique_names.remove("NNS")
     unique_names.remove("NNP")
     unique_names.remove("NNPS")
     print "unique names: ", unique_names
     return unique_names
Example #32
def get_names_entities_list(tokens):
    """(nltk.tokens) -> list
	Return a list of the person names among the tokens"""
    words_pos_tags = nltk.pos_tag(tokens)
    named_entities = [
        chunk for chunk in ne_chunk(words_pos_tags) if isinstance(chunk, Tree)
    ]

    named_entities_list = []

    for entity in named_entities:
        if entity.label() == 'PERSON':
            person_name = ""
            for word, postag in entity.leaves():
                person_name += word + " "
            named_entities_list.append(person_name)

    named_entities_list = list(set(named_entities_list))
    named_entities_list = [entity.strip() for entity in named_entities_list]
    return named_entities_list
Example #33
def NER(entry):
    global nouns, person, gpe, org, loc, mon
    global postags
    word_tokens = nltk.word_tokenize(entry)
    postags = nltk.pos_tag(word_tokens)
    nouns = ne_chunk(postags)
    for t in str(nouns).split("\n"):
        if "NNP" in t:
            if "PERSON" in t:
                a = t.split("/")
                b = a[0].split(" ")
                person.append(b[3])
                person = set(person)
                person = list(person)
            elif "GPE" in t:
                a = t.split("/")
                b = a[0].split(" ")
                gpe.append(b[3])
                gpe = set(gpe)
                gpe = list(gpe)
            elif "ORGANIZATION" in t:
                a = t.split("/")
                b = a[0].split(" ")
                org.append(b[3])
                org = set(org)
                org = list(org)
            elif "LOC" in t:
                a = t.split("/")
                b = a[0].split(" ")
                loc.append(b[3])
                loc = set(loc)
                loc = list(loc)
        elif "CD" in t:
            a = t.split(" ")
            b = a[2].split("/")
            mon.append(b[0])
            mon = set(mon)
            mon = list(mon)
    show()
    return 0
Example #34
 def get_ne(self, post_taged_sents, sent_tokenized_sents, chunker, mark):
     """
     This function gets all the name entity in the text.
     Parameters:
         post_taged_sents (list) - pos taged sentece
         sent_tokenized_sents (list) - sentenced tokenized text
         mark (string) - 'Q' means question; 'T' means text. To decide which chunker is used
     Variables:
         name_entity_sents (list) - the list of all the name entities in the text, 
                                    the element of the list is the list of the ne 
                                    of each sentence
     return name_entity_sents
     """
     name_entity_sents = []
     idx = 0
     for pos_tag_sent in post_taged_sents:
         #print(len(post_taged_sents))
         #print()
         #print(pos_tag_sent)
         #print(idx)
         if re.findall(r'[A-Z]&[A-Z]', sent_tokenized_sents[idx]):
             ne_abbred = re.findall(r'[A-Z]&[A-Z]',
                                    sent_tokenized_sents[idx])
             #name_entity_sents.append(ne_abbred)
         else:
             ne_abbred = []
         if mark == 'Q':
             ne_chunk = chunk.ne_chunk(pos_tag_sent, binary=True)
             #name_entity_sents.append(get_name_entity(get_name_entity_list(ne_chunk)))
         if mark == 'T':
             ne_chunk = chunker.parse(pos_tag_sent)
             #name_entity_sents.append(get_name_entity(get_name_entity_list(ne_chunk)))
         #print(ne_chunk)
         ne_chunks = ne_abbred + self.get_name_entity(
             self.get_name_entity_list(ne_chunk))
         name_entity_sents.append(ne_chunks)
         idx += 1
     return name_entity_sents
Example #35
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    sent = ne_chunk(sent)
    return sent
# The modal-verb tabulation below is the ConditionalFreqDist example from the NLTK book.
import nltk
from nltk.corpus import brown

genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd = nltk.ConditionalFreqDist(
      (genre, word)
      for genre in brown.categories()
      for word in brown.words(categories=genre))
cfd.tabulate(conditions=genres, samples=modals)

#                 can could  may might must will
#           news   93   86   66   38   50  389
#       religion   82   59   78   12   54   71
#        hobbies  268   58  131   22   83  264
#science_fiction   16   49    4   12    8   16
#        romance   74  193   11   51   45   43
#          humor   16   30    8    8    9   13

## Example Taken from Natural Language Processing with Python
## (Steve Bird, Ewan Klein, and Edward Loper 2009)
## Available at http://nltk.org/book/

from nltk import ngrams, wordpunct_tokenize
from nltk.tag import pos_tag
from nltk.stem.porter import PorterStemmer
from nltk.chunk import ne_chunk

s = "The Democrat admitted he's eying the mayor's race in a lengthy New York Times Magazine article that detailed his life with wife Huma Abedin, a close aide to former Secretary of State Hillary Clinton, and their attempts to stay out of the limelight over the past two years-until now."

tokens = wordpunct_tokenize(s)
st = PorterStemmer()
stemmed = [st.stem(w) for w in tokens]
pos = pos_tag(tokens)
tree = ne_chunk(pos)
tree.draw()
Example #37
def ne_chucking():
    tree = chunk.ne_chunk(part_of_speech_tagging())
Example #38
from nltk.chunk import ne_chunk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

in_filepath = r'E:\Subash\AI\data_in.txt'

fldetails = open(in_filepath, 'r')
for lines in fldetails.readlines():
    tokenizing = word_tokenize(lines)
    postagging = pos_tag(tokenizing)
    chunk_Sent = ne_chunk(postagging)
    print chunk_Sent
    chunk_Sent.draw()
Example #39
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.chunk import ne_chunk
import wikipedia

e = ""

topic = "Python (programming language)"
# topic = 'python'

try:
    entity = str(wikipedia.summary(topic, sentences = 4).encode('utf-8'))

    tokens = word_tokenize(entity)
    gmrTags = pos_tag(tokens)
    gmrChunks = ne_chunk(gmrTags, binary = True)

    print("Topic summary {}".format(topic))
    print entity
    print("= = = =")
    print("Topic has these noun phrases in 4 sentence summary: ")

    gmrNouns = []
    gmrPrev = None
    gmrPhrase = []

    for (token, pos) in gmrTags:
        if pos.startswith('NN'):
            if pos == gmrPrev:
                gmrPhrase.append(token)

words_in_the_part_of_the_sentence = tokenize.word_tokenize(parts_of_the_sentence[2])
print("words_in_the_part_of_the_sentence")
print(words_in_the_part_of_the_sentence)


from nltk import tag
tagged_part_of_sentence_with_corresponding_syntactical_value = tag.pos_tag(words_in_the_part_of_the_sentence)
print("tagged_part_of_sentence_with_corresponding_syntactical_value")
print(tagged_part_of_sentence_with_corresponding_syntactical_value)


from nltk import chunk
from nltk.tree import Tree
tree = chunk.ne_chunk(tagged_part_of_sentence_with_corresponding_syntactical_value)
print("tree")
print(tree)
#tree.draw()


the_tree = Tree("sentence", tokenize.sent_tokenize(initial_sentence))

#the_tree.draw()


print(the_tree)

list_of_sentences = []
for i in tokenize.sent_tokenize(initial_sentence):
    list_of_sentences.append(tag.pos_tag(tokenize.word_tokenize(i)))
def named_entities(tagged_body):
  body = ne_chunk(tagged_body)
  return body
Example #42
import cPickle as pickle
from pprint import pprint
import os

from nltk import chunk, tokenize


d = """"Hello, Martha. It isn't cocktail-time yet, is it?" The girl at the
table spoke without raising her head, almost without moving her lips, as
though she were afraid that the slightest breath would disturb the flaky
stuff in front of her."""


def tagLoader():
    data_dir = os.path.expanduser("~/.cwethan")
    tagger_cache_file = os.path.join(data_dir, "tagger.pkl")
    fh = open(tagger_cache_file, "rb")
    tagger = pickle.load(fh)
    fh.close()
    return tagger


tagger = tagLoader()
sent = tokenize.word_tokenize(d)
tags = tagger.tag(sent)
print chunk.ne_chunk(tags)
Example #43
	if tag in ('NNP', 'NNS'):
		value = "%s" % word
		nlist.append(value)


print(nlist)

#named entities

nen = ne_chunk(rtag)

print(nen)

#putting all the noun phrases in a file

f= open("noun_all.csv", "w")

f.write("\n".join(nlist))

f.close()

#extracting capitalized words

capword = RegexpTokenizer('[A-Z]\w+')
Example #44
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer.lemmatize('dogs')

from nltk.tokenize import sent_tokenize
sent_tokenize("Hello SF Python. This is NLTK. Its a good library.")

from nltk.tokenize import word_tokenize
word_tokenize('This is NLTK.')

from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize("What's up?")

words = word_tokenize("And now for something completely different")
from nltk.tag import pos_tag
pos_tag(words)

from nltk.chunk import ne_chunk
ne_chunk(pos_tag(word_tokenize('My name is Jacob Perkins.')))

ne_chunk(pos_tag(word_tokenize('San Francisco is foggy.')))

def bag_of_words(words):
    return dict([(word, True) for word in words])

feats = bag_of_words(word_tokenize("great movie"))
import nltk.data

classifier = nltk.data.load('classifiers/movie_reviews_NaiveBayes.pickle')
classifier.classify(feats)
Example #45
def nltk_entity_groups(text):
    """Return all contiguous NER tagged chunks by NLTK."""
    parse_tree = ne_chunk(pos_tag(word_tokenize(text)))
    ner_chunks = [' '.join([l[0] for l in t.leaves()])
                  for t in parse_tree.subtrees() if t.label() != 'S']
    return ner_chunks
	tokens = word_tokenize(file_y)
	for value in tokens:
		all_toks_class.append(value)

good_sent = [w.encode('ascii', 'replace') for w in all_sent if len(w) >= 10 and len(w) < 100]
good_toks = [w for w in all_toks_class if not w.lower() in stopset and not w.isdigit() and w.isalpha() and len(w) >= 4 and len(w) < 125]

fdist1 = FreqDist(good_toks)
most = fdist1.most_common(100)
list_values = list()
for word in most:
	list_values.append(word[0])

st = NERTagger('./stanford-ner/english.all.3class.distsim.crf.ser.gz','./stanford-ner/stanford-ner.jar')
tagged_words = st.tag(list_values)

tag_words = list()
for word in list_values:
	tag_words = tag_words + tag.pos_tag(word)

print "CHUNK WORDS:"
tree = chunk.ne_chunk(tagged_words)
print tree.draw

print "STANDFORD WORDS:"
for word in tagged_words:
	if (word[1] != 'O'):
		print word


from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del (doc["_id"])
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        if ind % 1000:
            annotated.insert(buf)
            buf = []
    if buf:
        annotated.insert(buf)
Example #48
def test_ne_chunk(sent):
    from nltk.chunk import ne_chunk

    print ne_chunk(sent)
Example #49
#This is based on
import sys
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk import chunk
file_name = sys.argv[1]
file = open(file_name, 'r', encoding='utf8')
text = ""
for line in file:
    text += line
file.close()
sentence_lst = sent_tokenize(text)
tagged_sentences = []
for sentence in sentence_lst:
    tokens = word_tokenize(sentence)
    tagged = pos_tag(tokens)
    tagged_sentences.append(tagged)
for tagged_sentence in tagged_sentences:
    tree = chunk.ne_chunk(tagged_sentence)
    print(tree)
Example #50
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags, ne_chunk
import spacy
from pprint import pprint

ex = 'Golden Diner is a little spot in Two Bridges that serves modern diner classics. It’s worth planning lunch here a week in advance.'
ex = 'Troubled burger group Byron will launch a new brand concept and menu on November 21. The 53-strong restaurant chain, which launched 12 years ago to much fanfare, narrowly escaped collapse amid the casual dining crunch, but has come out fighting with a bold new strategy. As much as £15m will be invested in the brand […]'


def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent


sent = preprocess(ex)
# print(sent)

pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)
Example #51
    file = open(file_path, "w+", encoding="utf-8")

    # Document level
    for doc_idx, text in enumerate(cleantexts):

        #if ( (doc_idx + 1) % 1) == 0:
        #    print("Processing document %d out of %d" % (doc_idx+1,n_docs))

        doc_key = doc_ids[doc_idx]

        # NER
        if tagger == "NLTK":
            # set_entities_tag = tag_with_NLTK(text)
            set_entities_tag = {
                (' '.join(c[0] for c in chunk), chunk.label())
                for chunk in ne_chunk(pos_tag(word_tokenize(text)))
                if hasattr(chunk, 'label')
            }
        #elif tagger == "Spacy":
        #    set_entities_tag = {(ent.text.strip(), ent.label_) for ent in spacy_model(text).ents if
        #                        ent.label_ not in skip}
        # elif tagger == "Stanford":
        # set_entities_tag = tag_with_stanford(text)

        # print(set_entities_tag)

        # Go through each entity found
        for word, label in set_entities_tag:

            # Skip entity if have following constraints
            if len(word) < 1 or check_skip_constraints(
Example #52
# load tagger from storage
f = open( 'tagger.pickle', 'r' )
tagger = pickle.load( f )

# If your tagger pickle file is located in a NLTK data directory, you could also
#    use nltk.data.load('tagger.pickle')

#--------------------------------------------------------------------------------
# Named Entity Chunking
#--------------------------------------------------------------------------------

# Need to start experimenting with chunkers.
# In the Packt book, NER is on page 133.

# Once you have parsed parts of speech from your sentences, then you can look
#    for names.  Must pass the ne_chunk method a list of tagged words, though,
#    not just a list of tokens.
from nltk.chunk import ne_chunk
ne_chunk( tagged_words )

# todo: figure out how to deal with the output of chunker for named-entity
#    recognition.
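# One common way to consume the chunker's output (a sketch, not necessarily how this
# script will end up doing it): walk the returned Tree for labelled subtrees, or flatten
# it to IOB triples with tree2conlltags. Reuses the tagged_words list from above.
from nltk.chunk.util import tree2conlltags

ne_tree = ne_chunk( tagged_words )
people = [ ' '.join( word for word, pos in subtree.leaves() )
           for subtree in ne_tree.subtrees()
           if subtree.label() == 'PERSON' ]
iob_triples = tree2conlltags( ne_tree )   # [(word, pos, 'B-PERSON' / 'I-PERSON' / 'O'), ...]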

# todo: figure out how to detect, filter said verbs - need to go through
#    articles, find all said verbs, make a custom classifier that can tag them
#    differently.

# todo: need a way to take said verbs and proper names and see if they are
#    proximal in a given sentence - if not within 4 or 5 words, might not be
#    attribution.