Example #1
def nltk_extract_ner(text):
    """
    Use of NLTK NE
    :param text:
    :return: list of all extracted NE
    """
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)

    d = defaultdict(list)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label') and t.label:
            # if it is recognized as a NE, store it under the key for its type
            if t.label() in ne_types:
                d[t.label()].append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    for tree in chunked_sentences:
        # Get results per sentence
        extract_entity_names(tree)


    # return all entities, grouped by NE type
    return d
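
A minimal usage sketch, assuming nltk and collections.defaultdict are already imported at module level; the ne_types set and the sample text below are illustrative assumptions:

# assumed label set for the non-binary chunker
ne_types = {'PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY'}

by_type = nltk_extract_ner("Barack Obama spoke at Harvard University in Cambridge.")
# by_type maps each NE label to the list of entity strings found under it
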
Example #2
def extract_entity_names(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def entity_names(t):
        names = []

        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    names.extend(entity_names(child))

        return names

    names = []
    for tree in chunked_sentences:
        # Print results per sentence
        # print extract_entity_names(tree)

        names.extend(entity_names(tree))

    return set(names)
def get_entities(story):
	entities = {}

	'''wrong code: before nltk.pos_tag(), the story needs to be split into sentences
		(on ',' and '.') using nltk.sent_tokenize(), and each sentence then tokenized
		into words (on ',' and '.') using nltk.word_tokenize().
	storytokens = tokenizer(story) #remove '\'', ',' and '.'
	pos_words = nltk.pos_tag(storytokens)
	'''

	sentences = nltk.sent_tokenize(story)
	tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
	#label 'Boy' and 'Scout' as 'NNP' respectively 
	tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
	#label 'Boy Scout' as 'NE'(entity)
	chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

	#
	entity_in_sentences = []
	for tree in chunked_sentences:
		#extract_entity_names(tree) find entities in each chunked_sentence
		entity_in_sentences.extend(extract_entity_names(tree))
	
	#delete repeat entities in all chunked_sentences
	entities_unique = set(entity_in_sentences)
	#create entities(dict object)
	i = 0
	for entity in entities_unique:
		entities[entity] = i
		i += 1

	return entities
Example #4
def nltkner(content):
    sentences = nltk.sent_tokenize(content.strip())
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
    persons = []
    orgs = []
    locs = []
    gpes = []
    times = []
    moneys = []
    percents = []
    dates = []
    facilitys = []
    for tree in chunked_sentences:
        person, loc, org, gpe, percent, time, date, money, facility = extract_entity_names(
            tree)
        persons.extend(person)
        locs.extend(loc)
        orgs.extend(org)
        gpes.extend(gpe)
        times.extend(time)
        moneys.extend(money)
        percents.extend(percent)
        dates.extend(date)
        facilitys.extend(facility)
    #return list(set(persons)),list(set(locs)),list(set(orgs)),list(set(gpes)),list(set(percents)),list(set(times)),list(set(datas)),list(set(moneys)),list(set(facilitys))
    return persons, locs, orgs, gpes, percents, times, dates, moneys, facilitys
def named_entities(sentence):
    # print(sentence)
    sentences = nltk.sent_tokenize(sentence)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []

    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    st = StanfordNERTagger(
        '/Users/Parth/Desktop/project_docs/codes/stanford-ner-2016-10-31/classifiers/english.muc.7class.distsim.crf.ser.gz',
        '/Users/Parth/Desktop/project_docs/codes/stanford-ner-2016-10-31/stanford-ner.jar',
        encoding='utf-8')

    tokenized_text = word_tokenize(sentence)
    classified_text = st.tag(tokenized_text)

    entities_list = []
    for entity in entity_names:
        entities_list += entity.split()

    for tuple in classified_text:
        if tuple[1] != 'O':
            entities_list.append(tuple[0])

    return list(set(entities_list))
Example #6
def parts_of_speech(corpus):
    "returns named entity chunks in a given text"
    sentences = nltk.sent_tokenize(corpus)
    tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
    pos_tags = [nltk.pos_tag(sentence) for sentence in tokenized]
    chunked_sents = nltk.ne_chunk_sents(pos_tags, binary=True)
    return chunked_sents
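
Since the function above just returns the chunked sentence trees, a minimal usage sketch (the sample sentence is illustrative) could walk each tree and pull out the binary 'NE' subtrees:

for tree in parts_of_speech("Ada Lovelace worked with Charles Babbage in London."):
    for subtree in tree.subtrees():
        if subtree.label() == 'NE':
            print(' '.join(token for token, tag in subtree.leaves()))
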
Example #7
def findrelations(text):
    roles = """
    (.*(                   
    computer scientist|
    led |
    adjunct professor).*)|
    co-founder|
    chairman|
    parents|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)

    for doc in chunked_sentences:
        print(doc)
        for rel in relextract.extract_rels('PER',
                                           'ORG',
                                           doc,
                                           corpus='ace',
                                           pattern=ROLES):
            #it is a tree, so you need to work on it to output what you want
            print(relextract.show_raw_rtuple(rel))
Example #8
def ne_extract(book, _num=''):
    print("NE Extraction: " + str(_num))
    # Create one big string from the whole book
    text = " ".join(list(itertools.chain.from_iterable(list(book.values())[1:-1])))
    # Split the sentences
    sentences = nltk.sent_tokenize(text)

    # Tokenize and tag the sentences...
    tokenizedSentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    taggedSentences = [nltk.pos_tag(sentence) for sentence in tokenizedSentences]
    # ...and then create an nltk.tree.Tree from the sentences
    chunkedSentences = nltk.ne_chunk_sents(taggedSentences, binary=True)

    # Create a list of all the named entities
    entityNames = buildDict(chunkedSentences)
    entityNames = removeStopwords(entityNames)          # Remove the stop words
    majorCharacters = getMajorCharacters(entityNames)   # And get occurrences > 10

    # Split the whole text in sentences using RegEx
    sentenceList = splitIntoSentences(text)

    # Compare the list of characters with each sentence
    # returns a dict of all sentences for each character
    characterSentences = compareLists(sentenceList, majorCharacters)

    return characterSentences 
Example #9
def extract_named_entities(text_blocks):
    """
    Return a list of named entities extracted from provided text blocks (list of text strings).
    """
    sentences = []
    for text in text_blocks:
        sentences.extend(nltk.sent_tokenize(text))

    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label'):
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    return set(entity_names)
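
A quick usage sketch for the helper above (the text blocks are purely illustrative):

blocks = ["Tim Berners-Lee proposed the World Wide Web at CERN.",
          "The first website went live in 1991."]
print(extract_named_entities(blocks))  # set of detected entity strings
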
def trazer_entidades_nomeadas_v(texto):
    """ Obtem as entidades nomeadas de uma string"""
    sentences = nltk.sent_tokenize(texto)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
    entity_names = []
    for tree in chunked_sentences:
        # Print results per sentence
        # print extract_entity_names(tree)
        entity_names.extend(extract_entity_names(tree))
    '''
    nomes_completos = []
    for i in range(0, len(entity_names)):
        if i < (len(entity_names) - 1):
            name = entity_names[i] + " " + entity_names[i + 1]
            nomes_completos.append(name)
    entity_names = entity_names + nomes_completos
    for nome in entity_names:
        print(nome)
    '''
    return entity_names


# Print all entity names
#print (entity_names)

# Print unique entity names
#print (TrazerEntidadesNomeadas())
Example #11
def chunkIntoEntities(text):
    entities = []
    sentences = sentenceTokenization(text)
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    for idx, tree in enumerate(chunked_sentences):
        entity_names = extract_entity_names(tree)
        entities.extend(entity_names)

    chunked_content = splitContentbyDelimiter(text, entities)
    return [chunked_content, entities]
Example #12
def extract_entity_question(question):

    sample = question
    sentences = nltk.sent_tokenize(sample)  #split in to sentences
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]  #split in to words
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]  #tag sentences with NN, NNP, etc
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    entity_names = []
    for tree in chunked_sentences:
        # Print result tree
        # print tree
        # Print results per sentence
        # print extract_entity_names(tree)

        entity_names.extend(extract_entity_names(tree))

    # Print all entity names
    # print entity_names

    # Remove incorrect entity "which"
    if 'Which' in entity_names:
        entity_names.remove('Which')
    if 'which' in entity_names:
        entity_names.remove('which')

    # Print unique entity names
    # print set(entity_names)
    return entity_names
def getEntities(filename):
    with open(filename, 'r') as f:
        sample = f.read()
    sample = sample.decode('unicode_escape').encode('ascii','ignore')
    print "sentence tokenize..."
    sentences = nltk.sent_tokenize(sample)
    print len(sentences)
    sentences = sentences[:len(sentences)/30]
    print len(sentences)
    print "word tokenize..."
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    print "POS tagging..."
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    print "Chunking..."
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    print "getting entities..."
    print "total sentences = ", len(chunked_sentences)
    for i, tree in enumerate(chunked_sentences):
        if i%100==0:
            print "on sentence", i
        entity_names.extend(extract_entity_names(tree))
    uniques = list(set(entity_names))
    #only return named entities that are 2 words or more
    output = [u for u in uniques if len(u.split(" ")) >= 2]
    return output
def generic_named_entities(file_path, seed_entities, model_name):
    """
    Obtains the generic entities from the sentences provided. This is because for the expansion strategies
    we only consider terms which are likely to be named entities by using NLTK entity detection, instead
    of all the words in the sentences.
    :param file_path:
    :return:
    """
    #unlabelled_sentence_file = open(file_path, 'r', encoding='utf-8')
    #text = unlabelled_sentence_file.read()
    print('Started to extract generic named entity from sentences...')
    #sentences = nltk.sent_tokenize(text)
    #tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    #tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    tag_reader = TagReader(file_path)
    sentence_reader = SentenceReader(file_path)

    def append(pair):
        entity_name = pair[1]
        if entity_name.lower() not in entity_names\
                and not wordnet.synsets(entity_name)\
                and entity_name.lower() not in stopwords.words('english')\
                and model_name not in entity_name.lower():
            entity_names.append(entity_name.lower())
            entity_sentence_pairs.append(pair)

    def extract_entity_word(t, sentence):
        """
        Recursively goes through the branches of the NLTK tagged sentences to extract the words tagged as entities
        :param t: NLTK tagged tree
        :return entity_names: a list of unique entity tokens
        """
        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                val = ' '.join([child[0] for child in t])
                append((sentence, val))
            else:
                for child in t:
                    extract_entity_word(child, sentence)

    chunked_sentences = nltk.ne_chunk_sents(tag_reader, binary=True)
    entity_names = []
    entity_sentence_pairs = []

    x = 0
    for elem in zip(chunked_sentences, sentence_reader):
        for seed_entity in seed_entities:
            if seed_entity in elem[1]:
                append((elem[1], seed_entity))

        extract_entity_word(*elem)

        x += 1
        if x % 1000 == 0:
            print('.', end='')
            sys.stdout.flush()

    print('Finished processing sentences with', len(entity_sentence_pairs),
          'new possible entities')
    return entity_sentence_pairs
Example #15
def remove_named_entities(content):
    if isinstance(content, float):
        return ''

    sentences = nltk.sent_tokenize(content)
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    entity_names = list(reversed(sorted(entity_names, key=len)))

    for ne in entity_names:
        content = content.replace(ne, 'NAMEDENTITY')

    return content
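
A small usage sketch (the sample sentence is illustrative; the exact entities found depend on the chunker):

masked = remove_named_entities("Angela Merkel met Emmanuel Macron in Berlin.")
print(masked)  # e.g. "NAMEDENTITY met NAMEDENTITY in NAMEDENTITY"
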
Example #16
def getNamedEntities(tweetlist):

    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in tweetlist
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        # Print results per sentence
        # print extract_entity_names(tree)
        entity_names.extend(extract_entity_names(tree))

    # return unique entity names
    return set(entity_names)
Example #17
def computeFeatures(post):
    numHTMLLinks = post.count("urlLink")
    post = post.replace('urlLink', '')
    sentences = nltk.sent_tokenize(post)
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    avg_sentlen = float(sum([len(word_tokenize(a))
                             for a in sentences])) / len(sentences)
    numUniqueWords = len(set(word_tokenize(post)))
    named_entities = []
    for chunk in chunked_sentences:
        entities = sorted(
            list(
                set([
                    word for tree in chunk
                    for word in extract_entity_names(tree)
                ])))
        for e in entities:
            if e not in named_entities:
                named_entities.append(e)
    print(named_entities)
    return numHTMLLinks, avg_sentlen, len(named_entities), numUniqueWords
Example #18
def extract_named_entities(text_blocks):
    """
    Return a list of named entities extracted from provided text blocks (list of text strings).
    """
    sentences = []
    for text in text_blocks:
        sentences.extend(nltk.sent_tokenize(text))

    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label'):
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    return set(entity_names)
Example #19
def get_referenced_place_names(text, *, use_countries=True, use_cities=True):
    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    sentences = nltk.sent_tokenize(text)

    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    entities = []
    for tree in chunked_sentences:
        entities.extend(extract_entity_names(tree))

    places = set()
    for entity in entities:
        if (use_countries and entity.lower()
                in countries) or (use_cities and entity.lower() in cities):
            places.add(entity.title())

    return places
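
A hedged usage sketch; the countries and cities collections below are illustrative stand-ins for whatever module-level lookups the original code relies on:

countries = {'france', 'japan'}   # stand-in lookup sets
cities = {'paris', 'tokyo'}

print(get_referenced_place_names("She flew from Paris to Tokyo via Japan."))
# e.g. {'Paris', 'Tokyo', 'Japan'}, depending on what the chunker recognises
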
def return_entity_list(full_text):
    entities = []

    for title in full_text:
        sentences = nltk.sent_tokenize(title)
        tokenized_sentences = [
            nltk.word_tokenize(sentence) for sentence in sentences
        ]
        tagged_sentences = [
            nltk.pos_tag(sentence) for sentence in tokenized_sentences
        ]
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

        for tree in chunked_sentences:
            entities.extend(extract_entity_names(tree))

    entities = list(set(entities))

    entities_list_len2 = []

    for entity in entities:
        if len(entity.split()) > 1 and entity not in entities_list_len2:
            entities_list_len2.append(entity)

    return entities, entities_list_len2
def generic_named_entities(file_path):
    """
    Obtains the generic entities from the sentences provided. This is because for the expansion strategies
    we only consider terms which are likely to be named entities by using NLTK entity detection, instead
    of all the words in the sentences.
    :param file_path:
    :return:
    """
    unlabelled_sentence_file = open(file_path, 'r', encoding='utf-8')
    text = unlabelled_sentence_file.read()
    print('Started to extract generic named entity from sentences...')
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    x = 0
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_word(tree))
        x += 1
        if x % 1000 == 0:
            print('.', end='')
    print('Finished processing sentences with', len(entity_names),
          'new possible entities')
    return entity_names
Example #22
def extract_EN(tag_sents):
    # The chunked sentences return the structure of each sentence in the form of trees.
    chunked_sentences = nltk.ne_chunk_sents(tag_sents, binary=True)

    # Recursive function that walks the tree.

    def extract_entity_names(t):
        entity_names = []
        # Check that the token has a label.
        if hasattr(t, 'label') and t.label:
            # t.label = <bound method Tree.label of Tree('S', [Tree('GPE ........
            # If it is an entity name, add it to the ones already identified.
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))

            # Otherwise, get all of the token's children and continue the search.
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))
        return entity_names

    # Initialize the result.
    entity_names = []

    # Walk through the tree corresponding to each sentence.
    for tree in chunked_sentences:
        #print('Arbol: ', tree)
        entity_names.extend(extract_entity_names(tree))

    # Return the result
    return entity_names
Example #23
def main():
    """Nltk default Named Entity Extractor"""

    with open(
            os.getcwd() +
            "\\Natural language processing\\articles\\sampleArticle3.txt",
            'r') as file:
        sample = file.read()

    # sentence segmenter
    sentences = nltk.sent_tokenize(sample)

    # word tokenizer
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]

    # part-of-speech tagger
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]

    # named entities extraction
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)

    entity_names = []
    for tree in chunked_sentences:

        # print(tree)
        entity_names.extend(extract_entity_names(tree))

    # Print unique entity names
    print(set(entity_names))
Example #24
def missing_entities(raw, current_entity_names):
    entity_names = []
    final_entities = []
    final_entities_ne = []
    final_entities_pos = []
    sentences = nltk.sent_tokenize(raw)
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
    for sentence in tagged_sentences:
        for word in sentence:
            if word[1] == 'CD':
                # If part of speech is "Cardinal Digit" (A number)
                # Add to label "TIME"
                final_entities.append(word[0])
                final_entities_pos.append(word[1])
                final_entities_ne.append('TIME')
    for tree in chunked_sentences:
        for child in tree:
            if hasattr(child, 'label') and child.label:
                if any(c in child.label() for c in (NE_GROUPS)):
                    if child[0][0] not in current_entity_names:
                        # If the word is not in training set
                        entity_names.append(child)

    for child in entity_names:
        final_entities_ne.append(child.label())
        final_entities.append(child[0][0])
        final_entities_pos.append(child[0][1])

    return final_entities, final_entities_pos, final_entities_ne
def getDateandCountry(pos):
    # extract the date periods and the country names from the provenance section of each painting
    grammar1 = "DATEr: {<CD><:><CD>}"  #grammar for daterange
    grammar2 = "DATE: {<CD>}"  #grammar for date
    cp1 = nltk.RegexpParser(grammar1)
    cp2 = nltk.RegexpParser(grammar2)
    result1 = [cp1.parse(sentence) for sentence in pos]
    result2 = [cp2.parse(sentence) for sentence in pos]
    #print(result)
    dateranges = [extract(r1, "DATEr") for r1 in result1]
    dates = [extract(r2, "DATE") for r2 in result2]
    for d in dates:  #keeping only 4 digit numbers as years
        if len(d) != 4:
            dates.remove(d)
    d = collections.OrderedDict.fromkeys(sum(
        dates, []))  #maintaining order of provenance
    dr = collections.OrderedDict.fromkeys(sum(dateranges, []))
    d = z = collections.OrderedDict.fromkeys(
        [re.sub('[^0-9]+', '', date) for date in d])
    chunks = nltk.ne_chunk_sents(pos)
    #print(list(chunks))
    gpe = [extract(c, "GPE") for c in chunks
           ]  #extracting country/city geo political entity names from chunks
    #print(primechunks)
    countries = collections.OrderedDict.fromkeys(sum(
        gpe,
        []))  #maintaining order of country names to combine with date ranges
    #print(countries)
    #return mergedchunks
    return dr.keys(), d.keys(), countries
Example #26
def chunkIntoEntities( text ):
    entities = []
    sentences = sentenceTokenization(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    
    def extract_entity_names(t):
        entity_names = []
    
        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))
    
        return entity_names
    
    for idx,tree in enumerate(chunked_sentences):
        entity_names = extract_entity_names(tree)
        entities.extend(entity_names)
    
    chunked_content = splitContentbyDelimiter(text, entities)
    return [chunked_content, entities]
def extract_ne_en(text):
    """text: utf-8
       refer to: http://www.nltk.org/book/ch07.html#ex-ie4
                 http://www.cnblogs.com/webRobot/p/6080155.html
       import nltk
       nltk.download() # punkt averaged_perceptron_tagger maxent_ne_chunker
       tokenizers/punkt/english.pickle
       taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle
       chunkers/maxent_ne_chunker/english_ace_binary.pickle
       corpora/words
    """
    if isinstance(text, str):
        text = text.decode("utf-8")
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    ct = Counter(entity_names)
    results = ct.most_common()

    return [r[0] for r in results]
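
As the docstring above points out, the NLTK data packages have to be fetched once before this will run; a one-time setup sketch along those lines:

import nltk
for pkg in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(pkg)
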
def identify_entities():
    f = headlines_filtered.titles.tolist()
    matches = []
    for line in tqdm(f, total=len(f), unit="headlines"):
        sentences = nltk.sent_tokenize(line)
        tokenized_sentences = [
            nltk.word_tokenize(sentence) for sentence in sentences
        ]
        tagged_sentences = [
            nltk.pos_tag(sentence) for sentence in tokenized_sentences
        ]
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

        entities = []
        for tree in chunked_sentences:
            entities.extend(extract_entity_names(tree))

        if len(entities) == 0:
            matches.append(np.nan)
        else:
            matches.append(", ".join(entities))

    series = pd.Series(matches)
    out = headlines_filtered.copy()
    out['entities'] = series.values
    out = out.dropna()
    out.to_csv("../data/headline_entity_id.csv")
Example #29
 def chunk(self, pos_tagged):
     """
     chunk POS tags into named entities
     :param pos_tagged:
     :return:
     """
     return nltk.ne_chunk_sents(pos_tagged)
 def parts_of_speech(self, corpus):
     "returns named entity chunks in a given text"
     sentences = nltk.sent_tokenize(corpus)  # use the tokenizer for Spanish
     tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
     pos_tags  = [nltk.pos_tag(sentence) for sentence in tokenized]
     chunked_sents = nltk.ne_chunk_sents(pos_tags, binary=True)
     return chunked_sents
Example #31
def extract_entities_nltk(sample):
    import nltk
    import operator
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    # create a map of entity -> count, representing
    # the number of occurrences of each entity
    entity_count = {}
    for entity in entity_names:
        if entity in entity_count:
            entity_count[entity] += 1
        else:
            entity_count[entity] = 1

    sorted_occurrences = sorted(entity_count.items(),
                                reverse=True,
                                key=operator.itemgetter(1))
    return sorted_occurrences
def extractKeywords(data):
    array = []
    logging.warning('NLTK processing starts:')
    logging.warning(data)
    for i, item in enumerate(data):
        sample = data[i]
        sentences = nltk.sent_tokenize(sample)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

        def extract_entity_names(t):
            entity_names = []
            if hasattr(t, 'label') and t.label:
                if t.label() == 'NE':
                    entity_names.append(' '.join([child[0].lower() for child in t]))
                else:
                    for child in t:
                        entity_names.extend(extract_entity_names(child))
            return entity_names

        entity_names = []
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names(tree))
        for item in entity_names:
            if item not in stops:
                array.append(item)
    logging.warning('NLTK processing finished:')
    logging.warning(array)
    return array
def extract_entities(sample):
    """
    Returns a set of proposed entities
    """
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        # Print results per sentence
        # print extract_entity_names(tree)

        entity_names.extend(extract_entity_names(tree))

    # Print unique entity names
    return set(entity_names)
def comment_to_chunked_wordlist(line):
    text = re.sub("[^a-zA-Z]", " ", line)
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [
        nltk.word_tokenize(sentence) for sentence in sentences
    ]
    tagged_sentences = [
        nltk.pos_tag(sentence) for sentence in tokenized_sentences
    ]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    entity_names = []
    for tree in chunked_sentences:
        for chunk in tree:
            if type(chunk) == nltk.Tree:
                entity_names.append(' '.join(c[0] for c in chunk.leaves()))
            else:
                entity_names.append(chunk[0])
        entity_names = [
            word.strip(string.punctuation).lower() for word in entity_names
            if len(word.strip(string.punctuation)) > 1
        ]

    words = [w for w in entity_names if not w in stops]
    return words
Example #35
def extract_person_names(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [pos_tagger.tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)

    return set(_flat_map(extract_person_names_from_tree(tree)
                         for tree in chunked_sentences))
def ie_process(document):
    "returns named entity chunks in a given text"
    sentences = nltk.sent_tokenize(document)
    tokenized = [nltk.word_tokenize(sentence.translate(str.maketrans('', '', string.punctuation))) for sentence in sentences]
    pos_tags  = [nltk.pos_tag(sentence) for sentence in tokenized]
    #print(pos_tags)
    chunked_sents = nltk.ne_chunk_sents(pos_tags, binary=True)
    return chunked_sents
def chunk_sentences(sentences):

    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]

    chunked_sentences = nltk.ne_chunk_sents(sentences, binary=True)

    return chunked_sentences
def findrelations(text):
    sentences = nltk.sent_tokenize(text)  # split the text into sentences
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]  # split the sentences into tokens
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]  # part-of-speech tagging
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)  # named entity extraction

    for doc in chunked_sentences:
        doc.draw()
Example #39
def get_top_NEs(tagged_sentences, n=TOP_NERs):
    """ Return the n longest named entities of a text """
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    return sorted(entity_names, key=len, reverse=True)[:n]
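
A usage sketch showing the expected input shape (extract_entity_names and TOP_NERs are assumed to be defined elsewhere in the module; the text is illustrative):

import nltk

text = "Alan Turing worked at Bletchley Park during the war."
tagged = [nltk.pos_tag(nltk.word_tokenize(s)) for s in nltk.sent_tokenize(text)]
print(get_top_NEs(tagged, n=3))  # up to the three longest entity strings
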
Example #40
def chunked_sentences(text):
    """Splits a large string into chunked sentences [http://www.nltk.org/book/ch07.html#chunking]
    """
    import nltk
    sentences = split_sentences(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    return chunked_sentences
def name_rec1(sample):
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return entity_names
def extract_named_entities(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return list(set(entity_names))
def analyse_hansard_file(filename='House of Representatives_2018_05_10_6091.xml'):
    # Word frequency analysis
    my_abbrev = ['\'m', '.', ',', '\'s', '(', ')', 'n\'t', '\'ve', ';', '$', ':', '\'', '?', '\'ll', '\'re']
    stoplist = set(stopwords.words('english') + my_abbrev)
    soup, sample = parse_hansard(filename)

    # Tokenisation, tagging, chunking
    sent_tokenizer = PunktSentenceTokenizer()
    # Stop breaking sentence at "No."
    sent_tokenizer._params.abbrev_types.add('no')
    #sentences = nltk.sent_tokenize(sample)
    # TODO: improve sentence tokenizer - still far from good
    sentences = sent_tokenizer.tokenize(sample)

    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    # Word frequency over all sentences
    tokens = []
    for sentence in tokenized_sentences:
        tokens += [word for word in sentence if word.lower() not in stoplist]
    display_freq(tokens)

    # Part-of-speech analysis
    tags = []
    for sentence in tagged_sentences:
        tags += sentence
    pos_analysis(tags, my_abbrev)

    # spaCy NER
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sample)
    # Find named entities, phrases and concepts
    ne_spacy = {}
    for entity in doc.ents:
        if entity.label_ in ne_spacy:
            ne_spacy[entity.label_] += [entity.text]
        else:
            ne_spacy[entity.label_] = [entity.text]
    logger.debug("Entity number per type: %s" % {k:len(v) for k,v in ne_spacy.items()})
    for k in ne_spacy.keys():
        display_freq(ne_spacy[k], 'Named entities (%s)' % (k,), top=20)

    # Interjection analysis
    parties = {}
    all_interjections = soup.find_all('interjection')
    for interjection in all_interjections:
        # Can be either a party or a role (Speaker, President, etc, ...)
        party = interjection.party.text or interjection.find('name', role='metadata').text
        if party in parties:
            parties[party] = parties[party] + 1
        else:
            parties[party] = 1
    logger.debug("%s interjections: %s" % (len(all_interjections), parties))
Example #44
 def get_ner_nltk(self, text):
   sents = nltk.sent_tokenize(text)  # sentences
   tokenized_sents = [nltk.word_tokenize(s) for s in sents]
   tagged_sents = [nltk.pos_tag(s) for s in tokenized_sents]
   chunked_sents = [x for x in nltk.ne_chunk_sents(tagged_sents)]
   raw = self.traverseTree(chunked_sents)
   ners = {}
   for n in self.entity_cols: ners[n] = []
   for k, v in raw:
       ners[k].append(v.lower())
   for n in self.entity_cols: ners[n] = list(set(ners[n]))
   return ners
Example #45
def get_entities3(text):
  sentences = nltk.sent_tokenize(text)
  tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
  tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
  #chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
  chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
  
  entity_names=[]
  for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))

  return filter_entities(entity_names)
Example #46
 def nominated_entities(self):
     
     sentences = nltk.sent_tokenize(self.article)
     tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
     tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
     chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
     
     entity_names = []
     for chunked_sentence in chunked_sentences:
         entity_names.extend(self._extract_entity_names(chunked_sentence))
     
     return list(set(entity_names))
Example #47
 def initialize(self, sample):
     sentences = nltk.sent_tokenize(sample)
     tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
     tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
     chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
     entity_names = []
     for tree in chunked_sentences:
         # Print results per sentence
         # print _extract_entity_names(tree)
         
         entity_names.extend(self._extract_entity_names(tree))
     return entity_names
def cfst(text_raw, text_parser_a):
	# TOKENIZATION: split into sentences
	text_sentences = nltk.sent_tokenize(text_raw)
	# TOKENIZATION: split into words
	text_words = [nltk.word_tokenize(sent) for sent in text_sentences]
	# TOKENIZATION: tag the words' part of speach
	text_posed = [nltk.pos_tag(sent) for sent in text_words]
	# BASIC-GROUP HANDLING: chunk the words at named entities
	text_chunked = nltk.ne_chunk_sents(text_posed)#[nltk.chunk.ne_chunk(sent) for sent in text_cleaned]
	# COMPLEX-PHRASE HANDLING: chunk the words
	text_chunked = [text_parser_a.parse(sent) for sent in text_chunked]
	# unwind the tree only keeping interesting parts
	text_done = []
	for sent in text_chunked:
		for x in xrange(0, len(sent)):
			tmp_string = str(sent[x])
			if tmp_string.startswith('(NP') or tmp_string.startswith('(VP'):
				text_frag = []
				for y in xrange(0, len(sent[x])):
					word = sent[x][y]
					if len(word) == 2:
						if len(word[0][0]) > 1:
							for sub_word in word:
								if len(sub_word[0]) > 0:
									text_frag.append(str(sub_word[0]).lower())
						else:
							if len(word[0]) > 0:
								text_frag.append(str(word[0]).lower())
					elif len(word) > 2:
						for sub_word in word:
							if len(sub_word[0]) > 0:
								text_frag.append(str(sub_word[0]).lower())
				if (len(text_frag) > 0):
					text_done.append(' '.join(text_frag))
	
	# remove duplicate phrases
	text_dictionary = {}
	for word in text_done:
		text_dictionary[word] = 1
	#return ' '.join(text_dictionary)
	
	# concatenate phrases
	phrase_dictionary = ' '.join(text_dictionary)
	
	# remove duplicate words from the concatenated phrases
	phrase_words = phrase_dictionary.split(' ')
	search_words = {}
	for word in phrase_words:
		search_words[word] = 1
	
	# return the unique words
	return ' '.join(search_words)
def prepareSentence(sample):
	sentences = nltk.sent_tokenize(sample)
	tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
	tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
	#chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
	chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
	chunked_sentences_list = []
	try:
		chunked_sentences_list.append(next(chunked_sentences))
	except StopIteration:
		pass

	# print chunked_sentences_list
	return chunked_sentences_list
	def char_recognition(self, char_number = 20):
		tagged_sentences = nltk.pos_tag_sents(self.tokenized_sentences)
		self.entities = []
		entity_names = []
		if nltk.__version__[0] == '3':
			chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
			for tree in chunked_sentences:
				entity_names.extend(extract_entity_names3(tree))
		else:
			chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=False)
			for tree in chunked_sentences:
				entity_names.extend(extract_entity_names(tree))
		count = Counter([name for name in entity_names])
		for c in count.most_common(char_number):
			self.entities.append(c[0])
def chunkSentences(text):
    """
    Parses text into parts of speech tagged with parts of speech labels.

    Used for reference: https://gist.github.com/onyxfish/322906
    """
    sentences = nltk.sent_tokenize(text)
    tokenizedSentences = [nltk.word_tokenize(sentence)
                          for sentence in sentences]
    taggedSentences = [nltk.pos_tag(sentence)
                       for sentence in tokenizedSentences]
    if nltk.__version__[0:2] == "2.":
        chunkedSentences = nltk.batch_ne_chunk(taggedSentences, binary=True)
    else:
        chunkedSentences = nltk.ne_chunk_sents(taggedSentences, binary=True)
    return chunkedSentences
def computeFeatures(post):
	numHTMLLinks = post.count("urlLink")
	post = post.replace('urlLink','')
	sentences = nltk.sent_tokenize(post)
	tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
	tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
	chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
	avg_sentlen = float(sum([len(word_tokenize(a)) for a in sentences]))/len(sentences)  
	numUniqueWords = len(set(word_tokenize(post)))
	named_entities = []
	for chunk in chunked_sentences:
		entities = sorted(list(set([word for tree in chunk for word in extract_entity_names(tree)])))
		for e in entities:
			if e not in named_entities:
				named_entities.append(e)
	print(named_entities)
	return numHTMLLinks,avg_sentlen,len(named_entities),numUniqueWords
def preprocess_for_nee_and_print(argument):                    	

	sentences = nltk.sent_tokenize(argument)
	tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
	tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
	chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

	entity_names = []

	for tree in chunked_sentences:
		#tree.draw()
		entity_names.extend(extract_entity_names(tree))

	#print entity_names
	named_entities = [w for w in entity_names if w not in not_NE]

	return named_entities
def entity_list(text):
    """Extract a list of entities from a piece of text"""
    try:
        text = text.decode()
    except AttributeError:
        pass
    # Parse the text into tagged and chunked sentences
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    # return unique entities
    return set(entity_names)
 def extract(self, text):
     sentences = nltk.sent_tokenize(text)
     tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
     tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
     chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
     
     entity_names = []
     for t in chunked_sentences:
         if hasattr(t, 'label') and t.label:
             if t.label() == 'NE':
                 entity_names.append(' '.join([child[0] for child in t]))
             else:
                 for child in t:
                     entity_names.extend(tree_traverse(child))
             
     return entity_names
     
     
def comment_to_chunked_wordlist(line):    
    text = re.sub("[^a-zA-Z]", " ", line )
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    
    entity_names = []
    for tree in chunked_sentences:  
        for chunk in tree:
            if type(chunk) == nltk.Tree:
                entity_names.append(' '.join(c[0] for c in chunk.leaves()))
            else:
                entity_names.append(chunk[0])
        entity_names = [word.strip(string.punctuation).lower() for word in entity_names if len(word.strip(string.punctuation)) > 1]
    
    words = [w for w in entity_names if not w in stops]        
    return words
def extract_entities(document):
    # uses NLTK to extract out the Noun Phrases and other meaningful phrases.

    """

    :param document:
    :return:
    """
    sentences = sent_tokenize(document)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [tagger.tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = ne_chunk_sents(tagged_sentences, binary=True)

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(_extract_entity_names(tree))

    return set(entity_names)