Example #1
def extract_entities_stanford(text):
    """Extract named entities, merging consecutive tokens that share a tag."""
    ne_tagger = nltk.StanfordNERTagger(
        "E:/Martin/Škola/KnowledgeBase/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz",
        "E:/Martin/Škola/KnowledgeBase/stanford-ner-2018-02-27/stanford-ner.jar")
    chunked = ne_tagger.tag(word_tokenize(text))
    current_chunk = []
    result = []
    ent_type = ""
    for token, tag in chunked:
        if tag != 'O':
            if tag == ent_type or not current_chunk:
                # Same entity type (or a fresh chunk): keep accumulating.
                current_chunk.append(token)
            else:
                # Entity type changed: flush the finished chunk and start a
                # new one with the current token so it is not lost.
                result = create_ne(current_chunk, result)
                current_chunk = [token]
            ent_type = tag
        elif current_chunk:
            # Left an entity span: flush the accumulated chunk.
            result = create_ne(current_chunk, result)
            current_chunk = []
    if current_chunk:
        result = create_ne(current_chunk, result)
    return result
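A minimal usage sketch for the function above. The create_ne helper is not part of the snippet, so the version below is a hypothetical stand-in; the hardcoded E:/ model and jar paths must also exist on your machine.

import nltk
from nltk.tokenize import word_tokenize

# Hypothetical stand-in for the create_ne helper assumed by the snippet:
# it joins the accumulated tokens into one entity string.
def create_ne(current_chunk, result):
    result.append(" ".join(current_chunk))
    return result

print(extract_entities_stanford("Barack Obama met Angela Merkel in Berlin."))
# Expected shape (exact output depends on the model):
# ['Barack Obama', 'Angela Merkel', 'Berlin']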
Example #2
def extract_entities_per_sentence(text, lib='NLTK'):
    assert lib in ('NLTK', 'StanfordNER'), "The value of 'lib' must either be 'NLTK' or 'StanfordNER'."

    entities = []

    if lib == 'NLTK':
        for sent in nltk.sent_tokenize(text):
            entities_per_sentence = collections.defaultdict(int)
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label'):
                    entities_per_sentence[(chunk.label(), ' '.join(c[0] for c in chunk))] += 1
            entities.append(entities_per_sentence)

    elif lib == 'StanfordNER':
        with open('config.yaml') as config_file:
            config = yaml.safe_load(config_file)
        stanford_ner_location = config.get('defaults', {}).get('depend', {}).get('stanford-ner')

        assert stanford_ner_location, "Please provide the location to the StanfordNER 2015-12-09 directory in the" \
                                      " defaults->depend->stanford-ner configuration section."

        tokenized_sents = []
        for sent in nltk.sent_tokenize(text):
            # Replace '/' in tokens, apparently to avoid issues with the
            # Stanford tagger.
            tokenized_sents.append([
                token.replace('/', '-')
                for token in nltk.word_tokenize(sent)])

        stanford_tagger = nltk.StanfordNERTagger(
            model_filename=os.path.join(stanford_ner_location, 'classifiers/english.all.3class.distsim.crf.ser.gz'),
            path_to_jar=os.path.join(stanford_ner_location, 'stanford-ner-3.6.0.jar'),
            encoding='utf-8')

        ne_tagged_sentences = stanford_tagger.tag_sents(tokenized_sents)
        for ne_tagged_sentence in ne_tagged_sentences:
            entities_per_sentence = collections.defaultdict(int)
            # Group consecutive tokens that share an NE tag into one chunk.
            for tag, chunk in itertools.groupby(ne_tagged_sentence, lambda x: x[1]):
                if tag != 'O':
                    entities_per_sentence[(tag, ' '.join(w for w, t in chunk))] += 1
            entities.append(entities_per_sentence)

    return entities
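A short usage sketch for the function above, using the NLTK backend so that no Stanford setup or config.yaml is needed; the sentence is made up for illustration.

import collections
import nltk

text = "Steve Jobs founded Apple in Cupertino. Bill Gates founded Microsoft."
for i, sentence_entities in enumerate(extract_entities_per_sentence(text, lib='NLTK'), 1):
    # Each element maps (label, entity text) pairs to in-sentence counts.
    print(i, dict(sentence_entities))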
Example #3
import nltk
import ner
import nltk.data
from nltk.corpus.reader import path_similarity, Synset
from nltk.stem import WordNetLemmatizer
import language_check
from nltk.corpus import wordnet as wn
import collections

from langdetect import detect

st = nltk.StanfordNERTagger(
    "nlp/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz",
    "nlp/stanford-ner-2015-12-09/stanford-ner.jar")

tool = language_check.LanguageTool('en-US')
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
wordnet_lemmatizer = WordNetLemmatizer()


def check_grammar(text):
    """Apply LanguageTool's suggested corrections to the text."""
    matches = tool.check(text)
    text = language_check.correct(text, matches)
    return text


def get_sentences(text):
    """Split text into sentences with the pre-loaded Punkt tokenizer."""
    return sent_detector.tokenize(text.strip())
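A small usage sketch for the two helpers above, assuming language_check (and its Java dependency) plus NLTK's punkt model are installed as the imports require:

raw = "This are a example text. It have some error."
corrected = check_grammar(raw)  # applies whatever corrections LanguageTool finds
for sentence in get_sentences(corrected):
    print(sentence)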
Example #4
    target = "targetTitle"  # "postText" or "targetTitle"
    prefix = "PT" if target == "postText" else "TA"
    NORMALIZE = False

    FEATURES_DATA_PATH = r"../features/pos_features_{}_{}_{}.csv".format(
        DATASET, target, "normalized" if NORMALIZE else "no-normalized")
    print(
        f"Generating POS features... it might take a while :P\n Path: '{FEATURES_DATA_PATH}' | {target} | {prefix}"
    )

    labeled_instances = get_labeled_instances(
        "../train_set/instances_converted_{}.pickle".format(DATASET),
        "../train_set/truth_converted_{}.pickle".format(DATASET))

    tagger = nltk.StanfordNERTagger(
        '../ner/english.all.3class.distsim.crf.ser.gz',
        '../ner/stanford-ner.jar',
        encoding='utf-8')

    tagset = nltk.load("help/tagsets/upenn_tagset.pickle")
    possible_tags = list(tagset.keys())

    ids = list(labeled_instances.id)
    if target == 'postText':
        # postText holds a single-element list per instance; unwrap it.
        texts = [txt[0] for txt in labeled_instances.postText]
    else:
        texts = list(labeled_instances.targetTitle)
    features = []
    for idx, txt in enumerate(texts, 1):
        print(f"Computing features for sample {idx} out of {len(texts)}...")
        features.append(
            generate_pos_features(txt,
Example #5
def nerTagger(tokens):
    """Tag pre-segmented Chinese tokens with the Stanford Chinese NER model."""
    chi_tagger = nltk.StanfordNERTagger(
        model_filename=r'E:\03_tools\machine learning\stanfordnlp\3.7\stanford-chinese-corenlp-2016-10-31-models\edu\stanford\nlp\models\ner\chinese.misc.distsim.crf.ser.gz',
        path_to_jar=r'E:\03_tools\machine learning\stanfordnlp\3.7\stanford-ner-2016-10-31\stanford-ner.jar')
    for word, tag in chi_tagger.tag(tokens):
        print(word, tag)
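A hedged usage sketch; nerTagger expects already-segmented tokens, so the example list below is segmented by hand (a segmenter such as jieba would normally produce it), and the E:\ model paths must exist locally.

import nltk

# Hand-segmented Chinese tokens for a made-up example sentence.
tokens = ['奥巴马', '访问', '了', '北京']
nerTagger(tokens)  # prints each token with the tag the Chinese model predicts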
Example #6
def search_for_location(line):
    """
    Verifies if passed in chunks are names of cities/locations
    Assumptions made:
    If the statement is about weather, then all named entities are treated
    as a location (this includes ORGANIZATION, PERSON tags)
    This function utilizes a combination of nltk's built in pos_tag() function and
    the Stanford NER Tagger. The function will choose the option that gives a longer
    location string.
    @param line: the text to search through
    @return: a string of the found location, if none found, returns None
    """
    # Only the model path is given here; NLTK resolves stanford-ner.jar
    # through the CLASSPATH environment variable in this setup.
    ner_tagger = nltk.StanfordNERTagger(
        "%s/dep/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz"
        % (os.getcwd()))

    # tags to identify for nltk
    loc_labels = ["GPE", "ORGANIZATION", "PERSON"]

    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(line)))
    # tree.draw()
    tags = ner_tagger.tag(line.split())
    # print(tags)

    # figure out the location from the Stanford tagger
    ner_location = ""
    for tag in tags:
        if tag[1] == "LOCATION":
            ner_location += (tag[0] + " ")

    ner_location = ner_location.strip().strip("?!., ")
    print("ner loc: %s" % (ner_location))

    # figure out the location from the nltk tagger
    location = ""

    # only the top level has the "S" label
    for subtree in tree.subtrees(lambda t: t.label() == "S"):
        for chunk in subtree.subtrees(lambda t: t.height() == 2):

            # some "chunks" are just strings apparently
            if isinstance(chunk, str):
                continue

            # each detected entity is separated from the next with a comma
            if chunk.label() in loc_labels:
                location_elem = ""

                for word, pos in chunk:
                    location_elem += word + " "

                location_elem = location_elem.strip()

                location += location_elem + ", "

    location = location.strip().strip(" ,")
    print("nltk loc: %s" % (location))
    '''
    if location != "":
        print("found location %s"%(location))
    else:
        print("No location found")
    '''

    return location if len(location) > len(ner_location) else ner_location
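A minimal usage sketch, assuming the working directory contains the dep/stanford-ner/... layout hardcoded above and that the Stanford NER jar is reachable (e.g. via CLASSPATH):

import os
import nltk

line = "What is the weather like in San Francisco today?"
found = search_for_location(line)
print(found if found else "No location found")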