def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    """Tag a sentence object with an in-memory Stanford NER model.

    Parameters
    ----------
    stanford_ner_path : path to the stanford-ner jar
    model_name : path to the serialized CRF classifier model
    sent_obj : sentence-like object with a ``.text`` attribute and a
        ``.sentence_attribs`` list (extended in place)
    type : unused here; retained for interface compatibility (note that
        the name shadows the ``type`` builtin)

    Returns
    -------
    The same ``sent_obj``, with ``sentence_attribs`` extended by the
    attributes derived from the classified tokens.
    """
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')

    text = sent_obj.text
    tokenized_text = list()
    spans = list()

    # Recover the character span of each whitespace-delimited token so NER
    # labels can be mapped back onto the original text.
    # fix: raw string for the regex (avoids invalid-escape DeprecationWarning)
    for match in re.finditer(r"\S+", text):
        start, end = match.span()
        word = match.group(0)
        # trailing punctuation is stripped from the token itself, but the
        # recorded span still covers the unstripped token
        tokenized_text.append(word.rstrip(",.;:)("))
        spans.append((start, end))
    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Expand each (token, label) tuple with its character span. Headers were
    # stripped above, so offset the span index by the number of removed
    # tokens (assumes the stripped tokens came from the front).
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = list()
    for idx, (token, label) in enumerate(classified_text):
        span_start, span_end = spans[idx + len_diff]
        final_class_and_span.append((token, label, span_start, span_end))

    sent_obj.sentence_attribs.extend(get_attributes(final_class_and_span))
    return sent_obj
# Esempio n. 2
# 0
def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    """Tag ``sent_obj.text`` with an in-memory Stanford NER model and attach
    the resulting attributes to the sentence object.

    :param stanford_ner_path: path to the stanford-ner jar
    :param model_name: path to the serialized CRF classifier
    :param sent_obj: object with ``.text`` and a ``.sentence_attribs`` list
        (mutated in place)
    :param type: unused; retained for interface compatibility (shadows the
        ``type`` builtin)
    :return: the same ``sent_obj``
    """
    stanford_tagger = StanfordNERTagger(model_name,
                                        stanford_ner_path,
                                        encoding='utf-8')

    text = sent_obj.text
    tokenized_text = []
    spans = []

    # Recover the character span of every whitespace-separated token.
    # fix: raw string for the regex (avoids invalid-escape DeprecationWarning)
    for match in re.finditer(r"\S+", text):
        spans.append(match.span())
        # trailing punctuation is stripped from the token, but the span
        # still refers to the full original token
        tokenized_text.append(match.group(0).rstrip(",.;:)("))
    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Headers were stripped above, so spans and classified tokens can differ
    # in length; offset span lookups by the difference (assumes the stripped
    # tokens came from the front of the sentence).
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = []
    for idx, (token, label) in enumerate(classified_text):
        start, end = spans[idx + len_diff]
        final_class_and_span.append((token, label, start, end))

    sent_obj.sentence_attribs.extend(get_attributes(final_class_and_span))
    return sent_obj
# Esempio n. 3
# 0
def ner_tag(questions):
    """Run the Stanford 3-class English NER tagger over a list of questions.

    :param questions: iterable of Question-like objects, each with an
        ``.utterance`` string attribute
    :return: list of tag lists, one ``[(token, label), ...]`` per question
    """
    path = 'C:\\Users\Martin\\PycharmProjects\\xserpy\\stanford-nlp\\'
    # set JAVAHOME before any tagging shells out to the Java backend
    java_path = "C:\\Program Files\\Java\\jdk1.8.0_65\\bin\\java.exe"
    os.environ['JAVAHOME'] = java_path
    st_ner = StanfordNERTagger(path+'classifiers\\english.all.3class.distsim.crf.ser.gz', path+'stanford-ner.jar')
    # fix: dropped the counter variable `i`, which was incremented but never used
    return [st_ner.tag(nltk.word_tokenize(q.utterance)) for q in questions]
# Esempio n. 4
# 0
class NERHandler(object):
    """
    handler class for the Stanford NER
    """
    def __init__(self):
        """
        constructor for the NERHandler class

        Sets CLASSPATH / STANFORD_MODELS (and JAVAHOME on Windows) so NLTK
        can locate the bundled Stanford NER distribution, then builds the
        3-class English tagger.
        :return:
        """
        if sys.platform.startswith('win'):
            os.environ['CLASSPATH'] = os.path.dirname(
                __file__) + "\\stanford-ner-2015-12-09\\"
            os.environ['STANFORD_MODELS'] = os.path.dirname(
                __file__) + "\\stanford-ner-2015-12-09\\classifiers\\"
            os.environ['JAVAHOME'] = "C:\\Program Files\\Java\\jre1.8.0_91"
        else:
            os.environ['CLASSPATH'] = os.path.dirname(
                __file__) + "/stanford-ner-2015-12-09/"
            os.environ['STANFORD_MODELS'] = os.path.dirname(
                __file__) + "/stanford-ner-2015-12-09/classifiers/"

        self.st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

    def tag(self, text):
        """
        search for Named locations within text
        :param text: String list containing text that needs to be searched
        :return: list of unique locations; consecutive LOCATION tokens are
            joined into one space-separated name
        """
        text = '. '.join(text)
        tags = self.st.tag(text.split())
        # if there is tag 'LOCATION' add to locations, note locations can be multiple tags long
        i = 0
        locations = []
        while i < len(tags):
            location = []
            if tags[i][1] == "LOCATION":
                location.append(tags[i][0])
                i += 1
                # fix: bound-check i so a LOCATION run at the very end of the
                # token list no longer raises IndexError
                while i < len(tags) and tags[i][1] == "LOCATION":
                    location.append(tags[i][0])
                    i += 1
                locations.append(' '.join(location))
            else:
                i += 1

        # deduplicate (original ordering is not preserved)
        locations = list(set(locations))
        return locations
# Esempio n. 5
# 0
def ner_tag(questions, path, java_path):
    """Tag each word in given set of questions with NER tag then return list of lists of tags

    Keyword arguments:
    questions -- list of Question objects
    path -- a path to Stanford NLP library
    java_path -- path to Java executable

    """

    sep = os.path.sep
    # Stanford NER tagger backed by the 3-class distsim English model
    classifier = path + "classifiers" + sep + "english.all.3class.distsim.crf.ser.gz"
    ner_jar = path + "stanford-ner.jar"
    tagger = StanfordNERTagger(classifier, ner_jar)
    os.environ['JAVAHOME'] = java_path

    # one [(token, label), ...] list per question
    return [tagger.tag(nltk.word_tokenize(question.utterance))
            for question in questions]
# Esempio n. 6
# 0
def ner_tag(questions, path, java_path):
    """Tag each word in given set of questions with NER tag then return list of lists of tags

    Keyword arguments:
    questions -- list of Question objects
    path -- a path to Stanford NLP library
    java_path -- path to Java executable

    """

    # Uses Stanford NER tagger with a dictionary
    st_ner = StanfordNERTagger(
        path + "classifiers" + os.path.sep +
        "english.all.3class.distsim.crf.ser.gz",
        path + "stanford-ner.jar")
    os.environ['JAVAHOME'] = java_path

    tagged = []
    for question in questions:
        tokens = nltk.word_tokenize(question.utterance)
        tagged.append(st_ner.tag(tokens))
    return tagged
# Esempio n. 7
# 0
def text2graph(text):
    """Build a co-occurrence MultiGraph of PERSON entities found in *text*.

    Tokens are NER-tagged, runs of consecutive tokens sharing the same
    (non-'O') class are merged into single multi-word entities, and every
    pair of PERSON entities inside a 20-entity sliding window becomes an
    edge whose 'time' attribute is the window-center index.
    """
    from nltk import StanfordNERTagger, word_tokenize
    import os
    os.environ['JAVAHOME'] = r"C:\Program Files (x86)\Java\jre1.8.0_181\bin\java.exe"

    st = StanfordNERTagger(r'..\..\..\stanford-ner-2018-10-16\classifiers\english.all.3class.distsim.crf.ser.gz',
                           r'..\..\..\stanford-ner-2018-10-16\stanford-ner.jar',
                           encoding='utf-8')

    classified_text = st.tag(word_tokenize(text))
    # fix: original indexed classified_text[0] and crashed on empty input
    if not classified_text:
        return nx.MultiGraph()

    # merge objects into one: collect consecutive same-class tokens
    merged_classified_text = []
    # fix: seed the running entity with the first token. The original copied
    # the raw first tuple straight into the output, emitted a spurious
    # ('', class) entity at the first class change (full_word started empty)...
    full_word = [classified_text[0][0]]
    for i in range(1, len(classified_text)):
        prev_class = classified_text[i - 1][1]
        current_word, current_class = classified_text[i]
        if current_class != prev_class or current_class == 'O':
            merged_classified_text.append((' '.join(full_word), prev_class))
            full_word = [current_word]
        else:
            full_word.append(current_word)
    # ...and never flushed the trailing entity; fix: emit it here
    merged_classified_text.append((' '.join(full_word), classified_text[-1][1]))

    # create dataframe of all edges in graph
    edges = []
    win_size = 20
    half_win_size = int(win_size / 2)
    for i in range(half_win_size, len(merged_classified_text) - half_win_size - 1):
        word, word_type = merged_classified_text[i]
        if word_type != 'PERSON':
            continue
        for neighbor, neighbor_type in merged_classified_text[i - half_win_size:i + half_win_size + 1]:
            if neighbor_type != 'PERSON':
                continue
            edges.append([word, neighbor, i])

    graph_df = pd.DataFrame(edges, columns=['from', 'to', 'time'])

    return nx.from_pandas_edgelist(graph_df, 'from', 'to', 'time', create_using=nx.MultiGraph())
# Esempio n. 8
# 0
            # NOTE(review): mid-function fragment — the enclosing def/loop
            # headers are outside this view; `event`, `lngp`, `eventsentspr`,
            # `stes`/`sten`/`stde` are all defined above this excerpt.
            # skip events missing a title for either language of the pair
            if any([ll not in event.keys() for ll in lngp]):
                continue

            # keep the (first-language title, second-language title) pair
            eventsentspr.append(
                [event[lngp[0]]['title'], event[lngp[1]]['title']])
            #lngpair.append(lngp)

    leftNE = []
    rightNE = []
    # For each title pair, NER-tag the LEFT title with the tagger matching the
    # pair's first language and keep tokens whose label is not 'O'/'o'.
    # NOTE(review): `stes`/`sten`/`stde` are presumably the Spanish/English/
    # German StanfordNERTagger instances — confirm against the full file.
    for pr in eventsentspr:
        timenow = time.time()  # NOTE(review): captured but unused in this excerpt
        NElist = []
        if lngp[0] == 'es':
            # crude tokenization: split on punctuation and whitespace
            wordslist = re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>? ]',
                                 pr[0])
            classified_text = stes.tag(wordslist)
            NElist = [w[0] for w in classified_text if w[1] not in ['o', 'O']]

        if lngp[0] == 'en':
            wordslist = re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>? ]',
                                 pr[0])
            classified_text = sten.tag(wordslist)
            NElist = [w[0] for w in classified_text if w[1] not in ['o', 'O']]

        if lngp[0] == 'de':
            wordslist = re.split(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>? ]',
                                 pr[0])
            classified_text = stde.tag(wordslist)
            NElist = [w[0] for w in classified_text if w[1] not in ['o', 'O']]

        # deduplicated named entities of the left-hand title
        leftNE.append(list(set(NElist)))
# Esempio n. 9
# 0
import nltk
import corenlp

nltk.download('punkt')  # tokenizer models required by nltk.word_tokenize
from nltk import StanfordNERTagger

# NER Using NLTK
# NOTE(review): model/jar paths are machine-specific absolute paths — adjust
# for your environment before running.
st = StanfordNERTagger(
    '/home/abin/my_works/nlp/stanford-ner-4.0.0/ner-model.ser.gz',
    '/home/abin/my_works/nlp/stanford-ner-4.0.0/stanford-ner.jar',
    encoding='utf-8')
#
text = 'Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action.'
#
# tokenize the sample sentence, then tag each token with the custom model
tokenized_text = nltk.word_tokenize(text)
classified_text = st.tag(tokenized_text)

print(classified_text)

# NER using stanford-corenlp library
# Make sure you have set $CORENLP_HOME as environment variable before start to use Stanford CoreNLPClient

with corenlp.CoreNLPClient(annotators="ner".split(), memory='2G') as client:
    ann = client.annotate(text)

print(ann)