def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    """Tag one sentence with a Stanford NER model and record token spans.

    The sentence is split on whitespace, trailing ``,.;:`` characters are
    stripped from each token, the token list is passed through
    ``strip_sec_headers_tokenized_text`` and the result is tagged.

    :param stanford_ner_path: path to the Stanford NER jar.
    :param model_name: path to the serialized classifier model.
    :param sent_obj: object exposing ``sentence`` and the mapping
        ``tok_sent_with_crf_predicted_attribs``; it is mutated in place.
    :param type: key under which the result is stored. The name shadows the
        builtin but is kept so keyword callers keep working.
    :return: ``sent_obj``, with a list of ``(token, tag, start, end)`` tuples
        stored under ``tok_sent_with_crf_predicted_attribs[type]``.
    """
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')

    tokenized_text = []
    spans = []
    # Recover the character spans while tokenizing. The pattern is a raw
    # string because "\S" in a plain literal is an invalid escape sequence.
    for match in re.finditer(r"\S+", sent_obj.sentence):
        tokenized_text.append(match.group(0).rstrip(",.;:"))
        # The span covers the whole whitespace-delimited word, including any
        # trailing punctuation that was stripped from the token itself.
        spans.append((match.start(), match.end()))
    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Header stripping may have removed tokens, so the remaining tokens are
    # aligned against the *last* len(classified_text) spans.
    # NOTE(review): this presumes headers are only removed from the front of
    # the token list - confirm against strip_sec_headers_tokenized_text.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = [
        (tagged[0], tagged[1], spans[idx + len_diff][0], spans[idx + len_diff][1])
        for idx, tagged in enumerate(classified_text)
    ]

    sent_obj.tok_sent_with_crf_predicted_attribs[type] = final_class_and_span
    return sent_obj
Beispiel #2
0
def get_location(loc):
    """Return the location named in *loc*, or ``None`` when there is none.

    Every token tagged ``LOCATION`` is collected and the tokens are joined
    with single spaces, which glues multi-word names such as 'New York' back
    together. All LOCATION tokens are joined, whether or not they were
    adjacent in the input.

    Notes kept from the original author:
        currently working only on my computer
        english Model
            english.muc.7class.distsim.crf.ser.gz
        german Models
            german.dewac_175m_600.crf.ser.gz
            german.hgc_175m_600.crf.ser.gz
        The location tag is 'LOCATION' for English and 'I-LOC' for German.

    :param loc: token sequence handed directly to ``StanfordNERTagger.tag``.
    :return: the space-joined location string, or ``None``.
    """
    # Named Entity Recognizer: recognizes named entities and assigns types like location, person, organization to the entity
    st = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
    'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    loc_ner = st.tag(loc)

    # Compare the tag field itself: testing `'LOCATION' in item` would also
    # match a token whose text happens to be "LOCATION".
    loc_tokens = [item[0] for item in loc_ner if item[1] == 'LOCATION']
    if not loc_tokens:
        # no location is specified
        return None
    return ' '.join(loc_tokens)
def extract_named_entities(threadName,output_collection,fetchedTweets):
    """Attach named entities to each tweet and write the tweets out in batches.

    For every tweet the ``cleaned_text`` field is split on whitespace and
    tagged; ``(token, tag)`` pairs tagged PERSON, ORGANIZATION or LOCATION are
    stored under the tweet's ``named_entities`` key. Tweets are handed to
    ``write_mongo`` in batches of 100, plus one final partial batch.

    :param threadName: label used in log messages and passed to ``write_mongo``.
    :param output_collection: destination passed through to ``write_mongo``.
    :param fetchedTweets: iterable of dict-like tweets with a ``cleaned_text`` key.

    Any exception is printed and the process exits via ``sys.exit()``.
    """
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    wanted_tags = ('PERSON', 'ORGANIZATION', 'LOCATION')
    try:
        mongo_list = []
        for counter, fetchedTweet in enumerate(fetchedTweets, 1):
            neList = st.tag(fetchedTweet['cleaned_text'].split())
            fetchedTweet['named_entities'] = [
                (ne[0], ne[1]) for ne in neList if ne[1] in wanted_tags
            ]

            mongo_list.append(fetchedTweet)
            if counter % 100 == 0:
                logging.info("{}: Tweets processed: {} tweets".format(threadName, counter))
                write_mongo(threadName,output_collection,mongo_list)
                mongo_list = []
        if mongo_list:
            # Flush the final, partially filled batch.
            write_mongo(threadName,output_collection,mongo_list)
    except Exception as e:  # "except Exception, e" is a syntax error on Python 3
        print(e)
        sys.exit()
	def pretag(self):
		"""Collect candidate tokens from ``self.text`` and store them on the instance.

		The items of ``self.text`` are stringified, joined with spaces, split on
		whitespace and tagged. A token becomes a candidate when:
		  * it is tagged ORGANIZATION or PERSON, or it has at least two
		    upper-case characters (per ``count_upper``) and fewer than 11
		    characters; or
		  * its lower-cased form is in ``dict_1m`` or the token is in ``dict_apps``; or
		  * its lower-cased form is in ``dict_tech``.

		Sets ``self.badlist``, ``self.symlist``, ``self.badlist_stem`` and
		``self.pretag``. Note that assigning ``self.pretag`` replaces this
		method on the instance with the resulting list.
		"""
		text=self.text
		st = StanfordNERTagger("/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz",
			"/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/stanford-ner.jar")
		paragraphs_string=' '.join(str(x) for x in text)
		tagging=st.tag(paragraphs_string.split())
		symlist=[ 'company','corporation','multinational', 'Corporation','open-source','social', 'network','software','system']
		badlist=['integrated','first','check','computer','linear', 'solution','services','limited','tech','solutions','technology','open','model','on','applied','network', 'pricing','customers','social','big','subscribe','social','sign','monitor','software','machine','learning','compute','management','up']
		self.badlist=badlist
		self.symlist=symlist
		self.badlist_stem=[stemmer.stem(word) for word in badlist]
		# Built once instead of once per token inside the comprehension.
		entity_labels=set(("ORGANIZATION","PERSON"))
		pretag1=[token for (token,label) in tagging if label in entity_labels or (count_upper(token)>=2 and len(token)<11)]
		pretag2=[token for (token,label) in tagging if token.lower() in dict_1m or token in dict_apps]
		pretag3=[token for (token,label) in tagging if token.lower() in dict_tech]
		self.pretag=pretag1+pretag2+pretag3
Beispiel #5
0
def ner():
	"""Point the Stanford environment variables at a local install and print NER tags.

	Each element of the module-level ``content`` is split on whitespace,
	tagged with the English 3-class model, and the tagged list is printed.
	"""
	ner_jar = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/stanford-ner.jar'
	os.environ['STANFORD_NER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer'
	os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
	os.environ['CLASSPATH'] = ner_jar
	# The POS tagger variable deliberately mirrors CLASSPATH.
	os.environ['STANFORD_POSTAGGER'] = ner_jar

	tagger = StanfordNERTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/classifiers/english.all.3class.distsim.crf.ser.gz')
	for line in content:
		tokens = line.split()
		print(tagger.tag(tokens))
def getEntityCount(tweet):
    """Return how many tokens of *tweet* carry a tag containing "PERSON".

    The tweet is tokenized with ``word_tokenize`` and tagged with the
    Stanford English 3-class model.
    """
    tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') 
    tagged_tokens = tagger.tag(word_tokenize(tweet))
    # Substring test on the tag field, exactly as the counting loop did.
    return sum(1 for tagged in tagged_tokens if "PERSON" in tagged[1])
def NERTagging(text):
    """Tag *text* with the Stanford 3-class model, then log and print the result.

    The tagged token list is appended to ``Dump/log/Main_output.txt`` and
    printed to stdout. Nothing is returned.

    :param text: raw text; it is tokenized with ``word_tokenize``.
    """
    # The context manager closes the log file even when tagging raises.
    with open("Dump/log/Main_output.txt", "a") as log_file:
        st = StanfordNERTagger('resources/ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                               'resources/ner/stanford-ner.jar',
                               encoding='utf-8')
        tokenized_text = word_tokenize(text)
        classified_text = st.tag(tokenized_text)
        log_file.write('NER \n %s \n' % classified_text)
        print(classified_text)
Beispiel #8
0
def nltk_ner(remainders):
	"""Look for PERSON tokens in the first element of *remainders*.

	Returns ``(found, name, item)`` where ``name`` is the title-cased PERSON
	tokens of ``item``, each followed by one space, and ``found`` is True when
	``name`` is non-empty. Returns None implicitly when *remainders* is empty.

	NOTE(review): the function returns during the first loop iteration, so
	later elements of *remainders* are never examined - confirm that this is
	intended.
	"""
	tagger = StanfordNERTagger('../stanford-ner/english.all.3class.distsim.crf.ser.gz', '../stanford-ner/stanford-ner.jar') 
	for item in remainders:
		person_parts = [
			tagged[0].title() + ' '
			for tagged in tagger.tag(item.split())
			if tagged[1] == u'PERSON'
		]
		name = "".join(person_parts)
		found = True if name else False
		return found, name, item
Beispiel #9
0
def get_namedentities(text):
  """
  Return the named-entity tokens found in text by StanfordNERTagger.

  The text is lower-cased and split on whitespace before tagging (the model
  used is the caseless 4-class one). Every token whose tag is not 'O' is
  passed through remove_punctuation; results that end up empty are dropped.
  """
  tagger = StanfordNERTagger('utils/english.conll.4class.caseless.distsim.crf.ser.gz','utils/stanford-ner.jar')   
  tagged_tokens = tagger.tag(text.lower().split())

  cleaned = (remove_punctuation(pair[0]) for pair in tagged_tokens if pair[1] != 'O')
  return [entity for entity in cleaned if entity]
def trial1():
    """
    Smoke test: tag one fixed sentence with the annotated-cities model and
    print the tagged tokens.
    :return: None
    """
    model_file = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/annotated-cities-model.ser.gz'
    jar_file = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/stanford-ner.jar'
    tagger = StanfordNERTagger(model_file, jar_file, encoding='utf-8')

    sample = 'While in France, Mrs. Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

    print(tagger.tag(word_tokenize(sample)))
def classify_text(text):
    """Tag every word of the input text with the 3-class Stanford NER model.

    The text is tokenized with ``word_tokenize``; each token is labelled
    PERSON, LOCATION, ORGANIZATION, or O (for other). Returns the tagger's
    list of (token, label) pairs.
    """
    base_dir = "C:/Users/liabbott/Documents/Projects/CBP OIT/stanford_ner/"
    model_path = os.path.normpath(base_dir + "classifiers/english.all.3class.distsim.crf.ser.gz")
    jar_path = os.path.normpath(base_dir + "stanford-ner.jar")
    tagger = StanfordNERTagger(model_path, jar_path, encoding='utf-8')

    return tagger.tag(word_tokenize(text))
    def __init__(self, use_stanford=False, NER_model=None, NER_tagger=None, POS_model=None, POS_tagger=None):
        """Set up the POS and NER tagging callables.

        :param use_stanford: boolean, whether to use Stanford NER and POS tagging
        :param NER_model: NER model path
        :param NER_tagger: NER tagger path
        :param POS_model: POS model path
        :param POS_tagger: POS tagger path

        With ``use_stanford`` false, ``self.post`` and ``self.nert`` are bound to
        ``nltk.pos_tag`` and ``nltk.ne_chunk``. Otherwise all four paths are
        required (the process exits if any is None) and the attributes are
        bound to the ``tag`` methods of the Stanford taggers.
        """
        self.use_stanford = use_stanford
        self.NER_model, self.NER_tagger = NER_model, NER_tagger
        self.POS_model, self.POS_tagger = POS_model, POS_tagger

        if not use_stanford:
            self.post = nltk.pos_tag
            self.nert = nltk.ne_chunk
            return

        required_paths = (NER_model, NER_tagger, POS_model, POS_tagger)
        if any(path is None for path in required_paths):
            sys.exit("tagging initialization: Stanford models and taggers" " have to be provided!")

        self.post = StanfordPOSTagger(self.POS_model, self.POS_tagger).tag
        self.nert = StanfordNERTagger(self.NER_model, self.NER_tagger).tag
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section = None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website.

    Consecutive tokens whose tag is not 'O' are merged into one entity string.
    An entity is filed under 'locations' when its first token is tagged
    LOCATION and under 'other' otherwise (persons and organizations are not
    separated).

    :param model: path to the serialized classifier.
    :param jar: path to the Stanford NER jar.
    :param fileids: file ids to process; defaults to ``corpus.fileids()``.
    :param corpus: corpus reader providing ``fileids()`` and ``words(fileid)``.
    :param section: when given, only this section (fetched with ``sectpull``)
        is tokenized and tagged instead of the whole file.
    :return: ``results[fileid][key]`` -> list of entity strings.
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger  = StanfordNERTagger(model, jar)

    def flush(fileid, chunk):
        # Store one completed run of entity tokens under its category.
        etext = " ".join(c[0] for c in chunk)
        key = 'locations' if chunk[0][1] == 'LOCATION' else 'other'
        results[fileid][key].append(etext)

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(list(sectpull([fileid],section=section))[0][1])
        else:
            text  = corpus.words(fileid)

        chunk = []

        for token, tag in tagger.tag(text):
            if tag != 'O':
                # Build chunk from tags
                chunk.append((token, tag))
            elif chunk:
                flush(fileid, chunk)
                chunk = []

        if chunk:
            # The text ended in the middle of an entity; without this final
            # flush that last entity would be silently dropped.
            flush(fileid, chunk)

    return results
def main():
    """Tag and parse one demo sentence, print both results, and return.

    Everything after the early ``return`` is unreachable work-in-progress
    (tregex-based question generation) and is kept as the author left it,
    apart from the print syntax.
    """
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar, path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    # NOTE(review): stock NLTK exposes `tag`, not `cur_tag`; presumably this
    # relies on a locally patched tagger - confirm.
    ne_tuple = st.cur_tag(sent)  # ##need write interface for tokenized sent (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized)
    # print(...) with one argument and the next() builtin work on both
    # Python 2.6+ and Python 3.
    print(ne_tuple)

    print(next(parser.raw_parse(raw_sent)))

    return
    # ---- unreachable below this line (early return above) ----
    # find name entity
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list

    init_file(main_tree)
                    ####### my issue here: 1. don't know how to get NP. 2. is there a quicker way to find PERSON ?
    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output(['bash',  ###add bash !!!!
                         tregex_path,
                         '-s',
                         pattern,
                         init_tree_file])
    print(head)

    def get_main_verbs(tree):
        pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
        main_verbs = check_output(['bash',  ###add bash !!!!
                                   tregex_path,
                                   '-s',
                                   pattern,
                                   init_tree_file])
        print(main_verbs)
        main_verbs = main_verbs.split('\n')[:-1]
        main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
        return main_verbs
Beispiel #15
0
    def __init__(self, language="en"):
        """Build the Stanford NER tagger and the entity finder for *language*.

        The jar and the English 3-class classifier are looked up under
        ``dist/`` relative to ``self.__currentDirectory``.
        """
        from nltk.tag import StanfordNERTagger

        base_dir = self.__currentDirectory
        self.__stanfordJar = "%s/dist/stanford-ner.jar" % base_dir
        self.__classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz" % (base_dir,)
        self.__tagger = StanfordNERTagger(
            self.__classifier,
            self.__stanfordJar,
            encoding="utf-8",
        )
        self.__namedEntitiesFinder = NERFinder(language=language)
Beispiel #16
0
def html_ner(content):
    """Extract named entities from an HTML document.

    ``script``, ``style`` and ``sup`` elements are removed, every remaining
    stripped text string is tokenized with ``wordpunct_tokenize`` and tagged,
    and consecutive tokens sharing a non-'O' tag are merged.

    Returns a list of ``(tag, text)`` pairs where ``text`` is the
    space-joined tokens encoded as UTF-8 and stripped.
    """
    tagger = StanfordNERTagger(
        './lib/classifiers/english.all.3class.distsim.crf.ser.gz',
        './lib/stanford-ner-3.5.2.jar')
    soup = BeautifulSoup(content, "html.parser")
    for unwanted in soup(["script", "style", "sup"]):
        unwanted.extract()

    text_fragments = list(soup.stripped_strings)
    token_lists = [wordpunct_tokenize(fragment) for fragment in text_fragments]
    tagged_sents = [tagger.tag(tokens) for tokens in token_lists]

    return [
        (label, ' '.join(w for w, t in group).encode('utf-8').strip())
        for sent in tagged_sents
        for label, group in groupby(sent, lambda pair: pair[1])
        if label != 'O'
    ]
Beispiel #17
0
    def sanitize_result(self, text):
        """Return the PERSON name found in *text*, or ``None`` if there is none.

        The text is passed through ``self.capitalize_first_letter``, tokenized
        and tagged; ``self.get_continuous_chunks`` groups the tagged tokens
        into entities. Consecutive PERSON entities are joined with spaces.
        When several separate PERSON groups exist, the last one is returned.

        :param text: raw input text.
        :return: the person name, or ``None`` when no PERSON entity was
            found (this case used to raise ``UnboundLocalError``).
        """
        # Raw strings: the Windows paths contain backslashes that must not be
        # read as escape sequences. The resulting values are unchanged.
        st = StanfordNERTagger(r'C:\Python27\stanford_ner\classifiers\english.all.3class.distsim.crf.ser.gz',
                               r'C:\Python27\stanford_ner\stanford-ner.jar',
                               encoding='utf-8')
        tokenized_text = word_tokenize(self.capitalize_first_letter(text))
        classified_text = st.tag(tokenized_text)

        named_entities = self.get_continuous_chunks(classified_text)
        named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]

        name = None
        for tag, chunk in groupby(named_entities_str_tag, lambda x: x[1]):
            if tag == "PERSON":
                name = " ".join(w for w, t in chunk)

        return name
Beispiel #18
0
    def __init__(self, model_num):
        """Create a Stanford NER tagger for the 3-, 4- or 7-class model.

        The model path is read from ``config.STANFORD_<n>CLASS`` and the jar
        from ``config.STANFORD_NER_JAR``. Any other ``model_num`` raises
        ``Exception``.
        """
        model_attrs = (
            (3, 'STANFORD_3CLASS'),
            (4, 'STANFORD_4CLASS'),
            (7, 'STANFORD_7CLASS'),
        )
        for class_count, attr_name in model_attrs:
            if model_num == class_count:
                # Only the selected attribute is read from config.
                pathname = getattr(config, attr_name)
                break
        else:
            raise Exception('No model for:', model_num)

        self.tagger = StanfordNERTagger(pathname, config.STANFORD_NER_JAR)
Beispiel #19
0
    def init_ner_mapper(self):
        """Load the German Stanford NER tagger and store it on ``self.tagger``."""
        # Alternative install location, kept for reference:
        # model_ger = "/opt/Projects/nlp/stanford-ner-2015-04-20/classifiers" \
        #             "/german/german.hgc_175m_600.crf.ser.gz"
        # stanford_jar = "/opt/Projects/nlp/stanford-ner-2015-04-20/stanford" \
        #                "-ner.jar"
        german_model = "/home/janrn/ner/german.hgc_175m_600.crf.ser.gz"   # earkdev
        ner_jar = "/home/janrn/ner/stanford-ner.jar"                      # earkdev

        tagger_options = dict(encoding="utf-8", java_options='-mx4096m')
        self.tagger = StanfordNERTagger(german_model, ner_jar, **tagger_options)
def main():
    """Run Stanford NER over one resume file and print the entities found.

    The file is read and cleaned with ``cleanse_data``, split on whitespace
    and tagged. The raw tagger output is written to ``ner_temp.txt`` (one
    item per line), converted to a tree with ``stanfordNE2tree``, and every
    subtree is printed as an ``(entity string, label)`` pair.
    """
    # Load the pretrained 4-class Stanford NER model.
    st = StanfordNERTagger(
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/classifiers/english.conll.4class.distsim.crf.ser.gz",
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/stanford-ner.jar",
        encoding="utf-8",
    )

    fname = "/home/viswanath/data/resume/test_data/01.txt"
    # Context managers close both files; the input handle used to be
    # rebound without ever being closed.
    with open(fname, "r") as fp:
        text = fp.read()
    lstemp = cleanse_data(text)
    list_ner_out = st.tag(lstemp.split())

    with open("ner_temp.txt", "w") as fp:
        for item in list_ner_out:
            fp.write("{0}\n".format(item))

    ne_tree = stanfordNE2tree(list_ner_out)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(ne_tree)

    ne_in_sent = []
    for subtree in ne_tree:
        if isinstance(subtree, Tree):  # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    print(ne_in_sent)
Beispiel #21
0
    def __init__(self):
        '''
        Set the working locations and the NER tagger from ``config``, then
        start the scanning process via ``self.scan()``.
        '''
        self.status = 0

        # Working areas, all taken from the config module.
        self.input_loc = config.input_area
        self.processing_loc = config.processing_area
        self.output_loc = config.output_area

        german_tagger = StanfordNERTagger(config.german_ner, config.stanford_jar, encoding='utf-8')
        self.tagger = german_tagger

        self.scan()
Beispiel #22
0
	def __init__(self, person):
		"""Load the NLP tools, create the SPARQL query helper, and populate the fields for *person*.

		The ``set*`` methods are called in a fixed order at the end.
		"""
		self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
		model_path = "ner/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
		jar_path = "ner/stanford-ner-3.4.jar"
		self.tagger = StanfordNERTagger(model_path, jar_path)
		self.ap = []
		self.person = person
		self.query = Sparql(person)
		# Call order kept exactly: later setters may rely on earlier ones.
		self.setSpouse()
		self.setMother()
		self.setFather()
		self.setFullName()
		self.setAbstract()
		self.setAbstractInfo()
def trial2():
    """
    Evaluate Stanford NER location tagging on the annotated-cities file.

    Each line of the file is a JSON object. Its
    ``high_recall_readability_text`` is tokenized and tagged with the MUC
    7-class, CoNLL 4-class and 3-class models. Only the 3-class LOCATION
    tags are scored against the annotated cities and states. The accumulated
    true-positive, false-positive and false-negative counts are printed.
    :return: None
    """
    pretrained_model_path = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/www-experiments/stanford-ner-2015-12-09/'
    all3class = pretrained_model_path+'classifiers/english.all.3class.distsim.crf.ser.gz'
    conll4class = pretrained_model_path+'classifiers/english.conll.4class.distsim.crf.ser.gz'
    muc7class = pretrained_model_path+'classifiers/english.muc.7class.distsim.crf.ser.gz'
    jar_path = pretrained_model_path+'stanford-ner.jar'
    st_muc = StanfordNERTagger(muc7class, jar_path, encoding='utf-8')
    st_conll = StanfordNERTagger(conll4class, jar_path, encoding='utf-8')
    st_3class = StanfordNERTagger(all3class, jar_path, encoding='utf-8')
    annotated_cities_file = '/Users/mayankkejriwal/datasets/memex-evaluation-november/annotated-cities/ann_city_title_state_1_50.txt'
    TP = 0
    FP = 0
    FN = 0
    with codecs.open(annotated_cities_file, 'r', 'utf-8') as f:
        for line in f:
            obj = json.loads(line)
            tokenized_text = word_tokenize(obj['high_recall_readability_text'])
            # All three models are still run so the combined-model variant
            # (any model says LOCATION) can be re-enabled easily.
            classified_text_muc = st_muc.tag(tokenized_text)
            classified_text_conll = st_conll.tag(tokenized_text)
            classified_text_3class = st_3class.tag(tokenized_text)

            correct_locations = _build_locations_true_positives_set(obj, ['correct_cities','correct_states','correct_cities_title'])
            # if 'correct_country' in obj and obj['correct_country']:
            #     correct_locations = correct_locations.union(set(TextPreprocessors.TextPreprocessors._preprocess_tokens
            #                                                     (obj['correct_country'].split(),['lower'])))
            tagged_locations = set()
            for tag_muc, tag_conll, tag_3class in zip(classified_text_muc, classified_text_conll, classified_text_3class):
                if str(tag_3class[1]) == 'LOCATION':
                # if str(tag_muc[1]) == 'LOCATION' or str(tag_conll[1]) == 'LOCATION' or str(tag_3class[1]) == 'LOCATION':
                    tagged_locations.add(tag_3class[0].lower())

            # Compute the overlap once instead of three times.
            matched = len(tagged_locations.intersection(correct_locations))
            TP += matched
            FP += len(tagged_locations) - matched
            FN += len(correct_locations) - matched
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('TP, FP, FN are...')
    print(TP)
    print(FP)
    print(FN)
Beispiel #24
0
class StanfordTagger(object):
    """
    Wrapper for the Stanford NER Tagger
    """
    # Paths to the classifier model and the Stanford jar; both are empty
    # placeholders here and must be filled in before the tagger can load.
    __classifier = ""
    __stanfordJar = ""
    def __init__(self, data=None):
        """Create the underlying ``StanfordNERTagger``.

        :data: accepted but not used by this wrapper.
        """
        from nltk.tag import StanfordNERTagger

        self.__tagger = StanfordNERTagger(self.__classifier, self.__stanfordJar, encoding="utf-8")

    def tags(self, raw_text):
        """
        Extract named entities from a raw text
        :raw_text: The raw text
        :return: the tagger's list of (token, tag) pairs
        """
        from nltk.tokenize import word_tokenize

        token_text = word_tokenize(raw_text)
        # StanfordNERTagger exposes `tag`; calling `tags` on it raised
        # AttributeError.
        ne_tags = self.__tagger.tag(token_text)
        return ne_tags
Beispiel #25
0
A big benefit of the Stanford NER tagger is that it provides us with a few different models for pulling out named entities. We can use any of the following:

3 class model for recognizing locations, persons, and organizations
4 class model for recognizing locations, persons, organizations, and miscellaneous entities
7 class model for recognizing locations, persons, organizations, times, money, percents, and dates


################################################################################################

The parameters passed to the StanfordNERTagger class include:

Classification model path (3 class model used below)
Stanford tagger jar file path
Training data encoding (default of ASCII)

"""

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Build the tagger from the 3-class model and the Stanford NER jar.
st = StanfordNERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
					   '/usr/share/stanford-ner/stanford-ner.jar',
					   encoding='utf-8')

# Sample sentence to classify.
text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

# Tokenize first, then tag the tokens.
tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)

print(classified_text)
Beispiel #26
0
from nltk.tag import StanfordNERTagger

# Tag a whitespace-split sample sentence with the English 3-class model.
st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz")
# print(...) with a single argument works on both Python 2 and Python 3;
# the bare print statement is a syntax error on Python 3.
print(st.tag(
    "Rami Eid is studying at Stony Brook University in NY. And he wants to work at CERN in Switzerland in Europe .".split()
))
def stanford_tagger(token_text):
    """Return the Stanford 3-class NER tags for an already tokenized text."""
    tagger = StanfordNERTagger(
        'english.all.3class.distsim.crf.ser.gz',
        'stanford-ner.jar',
    )
    return tagger.tag(token_text)
Beispiel #28
0
################################################################################
## This module contains functions that take sentences as input and return time
## and location information.
################################################################################

# Everything below runs at import time: both taggers are created as
# module-level objects and progress banners are printed.
print(":::::::::::::Loading time and location tagging Libraries::::::::::::::\n")
#Time Tagger libraries
import json
from sutime import SUTime
# Directory holding the SUTime jars.
jar_files = "./Resources/python-sutime-master/jars/"
sutime = SUTime(jars=jar_files, mark_time_ranges=True)

# NER Libraries
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger
# English 3-class model and jar from the 2018-02-27 Stanford NER release.
st = StanfordNERTagger('./Resources/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',\
						'./Resources/stanford-ner-2018-02-27/stanford-ner.jar', encoding='utf-8')

print("\n:::::::::::::All libraries loaded:::::::::::::\n\n")



def get_time(sentence):
	"""Return the time expressions SUTime finds in *sentence*.

	The parse result is round-tripped through JSON. The string-typed
	``value`` fields are joined with ", ". Returns None when SUTime finds
	nothing.
	"""
	parsed = sutime.parse(sentence)
	su_out = json.loads(json.dumps(parsed, sort_keys=True, indent=4))

	if len(su_out) == 0:
		return None

	# Keep only entries whose value is a plain string.
	return ", ".join([entry['value'] for entry in su_out if type(entry['value'])==type('')])

Beispiel #29
0
class StanfordTagger(object):
    """
    Wrapper for the Stanford NER Tagger
    """
    __currentDirectory = os.path.dirname(os.path.realpath(__file__)) # Current directory
    # Path templates; __init__ overwrites both on the instance with the
    # directory filled in.
    __classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz"
    __stanfordJar = "%s/dist/stanford-ner.jar"

    def __init__(self, language="en"):
        """
        Build the Stanford tagger and the named-entities finder
        :language: language passed on to NERFinder
        """
        from nltk.tag import StanfordNERTagger

        self.__stanfordJar = "%s/dist/stanford-ner.jar" % self.__currentDirectory
        self.__classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz" % (self.__currentDirectory,)
        self.__tagger = StanfordNERTagger( self.__classifier,
                                           self.__stanfordJar,
                                           encoding="utf-8")
        self.__namedEntitiesFinder = NERFinder(language=language)

    def __tags(self, raw_text):
        """
        Return the named entities tokens given a raw text
        :raw_text: Raw text (text, or UTF-8 encoded bytes)
        """
        from nltk.tokenize import word_tokenize

        # Only byte strings need decoding. On Python 3 a str has no
        # .decode(), so testing against `str` failed for every text input;
        # on Python 2 `bytes` is `str`, so behaviour there is unchanged.
        if isinstance(raw_text, bytes):
            raw_text = raw_text.decode('utf-8')
        # Tokenize the string
        token_text = word_tokenize(raw_text)
        # Retrieve the named entities from the tokens
        ne_tags = self.__tagger.tag(token_text)
        return ne_tags

    def __bio_tagger(self, ne_tagged):
        """
        Return BIO tags from named entities
        :ne_tagged: name_entities tokens, as (token, tag) pairs
        """
        bio_tagged = []
        prev_tag = "O"
        for token, tag in ne_tagged:
            if tag == "O":
                bio_tagged.append((token, tag))
            else:
                # A token continues an entity only when the previous token
                # had the same tag; any other non-O token begins a new one
                # (either after an O token or directly after another entity).
                prefix = "I-" if tag == prev_tag else "B-"
                bio_tagged.append((token, prefix + tag))
            prev_tag = tag
        return bio_tagged

    def __generate_tree(self, bio_tagged):
        """
        Transform a list of BIO-tagged tokens into a tree
        :bio_tagged: (token, BIO tag) pairs
        """
        from nltk import pos_tag
        from nltk.chunk import conlltags2tree

        if not bio_tagged:
            # zip(*[]) yields nothing to unpack; return an empty tree
            # instead of raising ValueError on empty input.
            return conlltags2tree([])

        tokens, ne_tags = zip(*bio_tagged)
        pos_tags = [pos for token, pos in pos_tag(tokens)]

        conlltags = list(zip(tokens, pos_tags, ne_tags))
        ne_tree = conlltags2tree(conlltags)
        return ne_tree

    def __getEntities(self, taggedWords):
        """
        It returns the entities from a list of tagged words (NER or POS) after generating the syntax tree
        """
        bio_tagged = self.__bio_tagger(taggedWords)
        stanford_tree = self.__generate_tree(bio_tagged=bio_tagged)

        entities = self.__namedEntitiesFinder.getEntities(stanford_tree)
        return entities

    def getEntitiesByTags(self, pos_tagged_words):
        """
        Get entities from a list of already tagged words.
        NOTE(review): the parameter name says POS tags, but the pairs go
        through the same BIO conversion as NER tags - confirm what callers pass.
        """
        entities = self.__getEntities(taggedWords=pos_tagged_words)
        return entities

    def getEntities(self, raw_text):
        """
        Get the entities from a raw text
        """
        ne_entities = self.__tags(raw_text=raw_text)
        entities = self.__getEntities(taggedWords=ne_entities)
        return entities
Beispiel #30
0
def ch_nertagger(str):
    """Print each whitespace-separated token of *str* with its Chinese NER tag.

    The parameter name shadows the builtin ``str``; it is kept for callers.
    """
    tagger = StanfordNERTagger(
        model_filename=r'E:\tools\stanfordNLTK\jar\classifiers\chinese.misc.distsim.crf.ser.gz',
        path_to_jar=r'E:\tools\stanfordNLTK\jar\stanford-ner.jar',
    )
    tagged_tokens = tagger.tag(str.split())
    for word, tag in tagged_tokens:
        print(word,tag)
Beispiel #31
0
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# Build the tagger from the 3-class model and the Stanford NER 3.6.0 jar.
st = StanfordNERTagger('C:\\Users\\Amlendu11\\Downloads\\stanford-ner-2015-12-09 (1)\\stanford-ner-2015-12-09\\classifiers\\english.all.3class.distsim.crf.ser.gz',
					   'C:\\Users\\Amlendu11\\Downloads\\stanford-ner-2015-12-09 (1)\\stanford-ner-2015-12-09\\stanford-ner-3.6.0.jar',
					   encoding='utf-8')

# Sample sentence to classify.
text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

# Tokenize first, then tag the tokens.
tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)

print(classified_text)
Beispiel #32
0
#!/usr/bin/env python
# -*- encoding=utf-8 -*-

import re

from nltk.tag import StanfordNERTagger
import jieba

from extract_time import *

# Module-level Chinese NER tagger, created once at import time.
chinese_ner = StanfordNERTagger('chinese.misc.distsim.crf.ser.gz')

# Maps each entity class name to an integer id.
entity_class = {"PERSON": 0, "GPE": 1, "MISC": 2, "ORGANIZATION": 3, "O": 4}


def extract_entity(s):
    """Segment *s* with jieba, tag the tokens, and group them by entity class.

    Every (token, class) pair is printed while it is processed.

    NOTE(review): as it stands the function never updates ``pre_cls`` and
    returns nothing, so ``entity_dict`` is never filled. The tail of the
    function appears to be missing - confirm against the original source.
    """
    tokens = list(jieba.cut(s))
    r = chinese_ner.tag(tokens)

    entity_dict = {}
    pre_cls = ""
    terms = []
    for token, cls in r:
        # print type(token)
        # print(...) with one pre-formatted argument works on Python 2 and 3;
        # the bare print statement is a syntax error on Python 3.
        print("%s, %s" % (token, cls))
        if cls != pre_cls:
            if pre_cls != "":
                # The class changed: store the terms collected so far.
                entity_dict.setdefault(pre_cls, [])
                entity_dict[pre_cls].append(terms)
                terms = []
        terms.append(token)
from pymongo import MongoClient
import json
from bson import json_util
import nltk
import json
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# MongoDB connection
cliente = MongoClient()  # Initialize the client object
cliente = MongoClient('127.0.0.1', 27017)  # Specify the server parameters (rebinds the client above)
bd = cliente.taller4  # Select the schema (database)
coleccion = bd.body_respuestas  # Select the collection

# Stanford NER tagger using the English MUC 7-class model.
st = StanfordNERTagger(
    '/home/xubuntu/Taller4/nueva/classifiers/english.muc.7class.distsim.crf.ser.gz',
    '/home/xubuntu/Taller4/nueva/stanford-ner.jar',
    encoding='utf-8')

#consulta1= coleccion.find({"items.question_id":{"$gte":60000}})
pregunta = 0

# Zero-initialized counters and an empty entity list; the code that uses
# them is not visible in this snippet.
p = 0
y = 0
k = 0
l = 0
m = 0
n = 0
entidades = []
try:
    bd = cliente.taller4  #Seleccionar Schema
    coleccion = bd.body_respuestas  #Seleccionar Coleccion
Beispiel #34
0
from nltk.parse.stanford import StanfordDependencyParser, StanfordParser
from nltk.tag import StanfordNERTagger
import os

# Install locations of the three Stanford tool distributions.
path_ner = "/home/pongsakorn/Desktop/stanford-ner-2017-06-09"
path_parser = "/home/pongsakorn/Desktop/stanford-parser-full-2017-06-09"
path_postagger = "/home/pongsakorn/Desktop/stanford-postagger-full-2017-06-09"

# Colon-separated class path: the current directory plus the three tool directories.
class_path_cmd = ".:{}:{}:{}".format(path_ner, path_parser, path_postagger)

path_postagger_model = "/home/pongsakorn/Desktop/stanford-postagger-full-2017-06-09/models"
path_ner_clf = "/home/pongsakorn/Desktop/stanford-ner-2017-06-09/classifiers"

# Colon-separated list of directories that hold model files.
class_model_cmd = "{}:{}:{}".format(path_postagger_model, path_parser, path_ner_clf)
#print(class_path_cmd)
#print(class_model_cmd)

# Exported before the parsers and tagger below are constructed.
os.environ['CLASSPATH'] = class_path_cmd
os.environ['STANFORD_MODELS'] = class_model_cmd


# English PCFG model shared by both parsers.
model_path = '/home/pongsakorn/Desktop/stanford-parser-full-2017-06-09/englishPCFG.ser.gz'

stanford_dependency_parser = StanfordDependencyParser(model_path=model_path)
stanford_parser = StanfordParser(model_path=model_path)

# NOTE(review): unlike the absolute paths above, the tagger uses paths
# relative to the working directory - confirm this is intended.
stanford_ne_tagger = StanfordNERTagger('../../stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz', 
                                      path_to_jar='../../stanford-ner-2017-06-09/stanford-ner.jar')
Beispiel #35
0
# -*- coding: utf-8 -*-

import sys
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
import nltk
import json
import scale
import pickle

# Stanford NER tagger loaded with the English 7-class MUC model; both paths
# are relative to the working directory.
st = StanfordNERTagger(
    '../../stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz',
    '../../stanford-ner-2018-10-16/stanford-ner.jar',
    encoding='utf-8')


def tag_answers_tag(fw, ner, pos, tag1):
    """Walk the entries of ``ner`` and flag those whose tag equals ``tag1``.

    Only the beginning of the function is visible in this excerpt, so the
    roles of ``fw`` and ``pos`` and the return value cannot be documented
    from here.

    :param ner: indexable sequence whose items are read as ``item[0]``
        (a string) and ``item[1]`` (a tag) -- presumably (token, tag) pairs;
        confirm against the caller
    :param tag1: tag value compared against ``ner[i][1]``
    """
    words = []
    tags = []
    answer = []
    i = 0
    c = 0
    a = 1
    while i < len(ner):
        # Case feature: "U" when the entry's first character is upper-case,
        # "L" otherwise.
        if ner[i][0][0].isupper():
            y = "U"
        else:
            y = "L"
        # Entry carries the requested tag while the flag ``a`` is still set.
        if ner[i][1] == tag1 and a:
            z = 'A '
            c = 1
Beispiel #36
0
        print("Verb::->" + token.text)
    elif (token.pos_ == 'NOUN'):
        # print token
        print("NOUN-->" + token.text)
    elif (token.pos_ == 'PROPN'):
        # print token
        print("PROPN-->" + token.text)

##############################################

article = "Attackers compromise Microsoft Exchange servers to hijack internal email chains Pakistan , India , US"

import nltk
from nltk.tag import StanfordNERTagger

# The label names the library correctly ("NLTK", not "NTLK").
print('NLTK Version: %s' % nltk.__version__)

# Single definition of the install directory shared by the model and the jar,
# so the two paths cannot drift apart.
stanford_ner_dir = (
    'C:/Users/TahsinAsif/OneDrive - CYFIRMA INDIA PRIVATE LIMITED/'
    'antuitBackUp@3march/Asif/AI/NameEntityRecog/stanford_ner/')

stanford_ner_tagger = StanfordNERTagger(
    stanford_ner_dir + 'classifiers/english.muc.7class.distsim.crf.ser.gz',
    stanford_ner_dir + 'stanford-ner-3.9.2.jar')

# The article is already space-separated, so a plain split() is the tokenizer.
results = stanford_ner_tagger.tag(article.split())

print('Original Sentence: %s' % (article))
for tag_value, tag_type in results:
    # 'O' marks tokens outside any named entity; report only real entities.
    if tag_type != 'O':
        print('Type: %s, Value: %s' % (tag_type, tag_value))
class Tagger:
    """Adds paragraph, sentence, time, speaker and location markup to seminar texts."""

    def __init__(self):
        """
        Builds the backoff POS tagger and the Stanford NER tagger.
        An existing ``out/`` directory is deleted, so output of an earlier run
        is discarded whenever a new Tagger is created.
        """
        self.backoff = self.backoff_tagger(backoff=DefaultTagger('NN'))
        self.st = StanfordNERTagger(
            'stanfordNERJars/classifiers/english.all.3class.distsim.crf.ser.gz',
            'stanfordNERJars/stanford-ner.jar',
            encoding='utf-8')
        if os.path.exists("out/"):
            shutil.rmtree('out/')

    # Tagged Brown-corpus sentences used to train the n-gram taggers. This is a
    # class attribute, so the slice is taken once, when the class is defined.
    train_sents = brown.tagged_sents()[:48000]

    def backoff_tagger(self, backoff=None):
        """
        Used to tag text using a more accurate backoff tagger.
        Unigram, bigram and trigram taggers are trained in that order, each one
        backing off to the tagger built before it.
        :param backoff: the current backoff
        :return: a backoff tagger
        """
        for cls in [UnigramTagger, BigramTagger, TrigramTagger]:
            backoff = cls(self.train_sents, backoff=backoff)
        return backoff

    def ner_stanford(self, text, entity):
        """
        Gets a set of specific entities from text
        :param text: the text we want to search in
        :param entity: the entity tag to extract
        :return: the set of entities; consecutive tokens carrying the tag are
                 joined with single spaces into one entity
        """
        tokenized_text = word_tokenize(text)
        classified_text = self.st.tag(tokenized_text)
        return {
            " ".join(word for word, _ in chunk)
            for tag, chunk in groupby(classified_text, lambda pair: pair[1])
            if tag == entity
        }

    @staticmethod
    def tag_paragraphs(text):
        """
        Tags paragraphs in text
        :param text: text to be tagged
        :return: the text with every non-empty first group matched by
                 paragraphRegex wrapped in <paragraph> tags
        """
        # Pad with blank lines so the first and last paragraph are delimited
        # like the inner ones.
        text = '\n\n{}\n\n'.format(text.strip('\n'))
        para = re.compile(paragraphRegex)
        for match in para.finditer(text):
            paragraph = match.group(1)
            if paragraph:
                # str.replace tags every occurrence of the paragraph text.
                text = text.replace(paragraph, '<paragraph>{}</paragraph>'.format(paragraph))

        return text.strip()

    def tag_sentences(self, text):
        """
        Tags sentences in the text
        :param text: text to be tagged
        :return: tagged text
        """
        # Tokenize each stretch between paragraph tags separately, so that no
        # sentence spans a tag.
        sentences = []
        for part in re.split(r'</?paragraph>', text):
            sentences.extend(sent_tokenize(part.strip()))

        # Keep only the sentences accepted by not_sentence_regx_str.
        accepted = [sent for sent in sentences
                    if re.match(not_sentence_regx_str, sent) is not None]

        for sent in accepted:
            text = text.replace(sent, '<sentence>{}</sentence>'.format(sent))

        return text

    @staticmethod
    def tag_times(stime, etime, text):
        """
        Tags times in the text
        :param stime: the start time (may be empty when only an end time is known)
        :param etime: the end time (may be empty)
        :param text: the text to tag
        :return: the text tagged with times
        """
        if not etime and not stime:
            return text
        textHolder = text
        time_regx = re.compile(time_regx_str)

        for time_str in set(time_regx.findall(textHolder)):
            found_time = time_parser.parse(time_str).time()
            # Each bound is parsed only when it was actually supplied; parsing
            # a missing start time would fail when only an end time is known.
            if stime and time_parser.parse(stime).time() == found_time:
                textHolder = textHolder.replace(time_str, '<stime>{}</stime>'.format(time_str))
            elif etime and time_parser.parse(etime).time() == found_time:
                textHolder = textHolder.replace(time_str, '<etime>{}</etime>'.format(time_str))
        return textHolder

    @staticmethod
    def tag_locations(locations, text):
        """
        Tags locations in the text
        :param locations: locations to be tagged
        :param text: text to be tagged
        :return: the text with locations tagged; every case-insensitive match
                 is replaced by the location as spelled in ``locations``
        """
        for loc in locations:
            if not loc:
                # An empty pattern matches at every position and would insert
                # tags between all characters.
                continue
            compiled = re.compile(re.escape(loc), flags=re.IGNORECASE)
            tagged = '<location>' + loc + '</location>'
            # A callable replacement inserts the text literally; a plain string
            # would have backslashes in ``loc`` processed as escapes or group
            # references.
            text = compiled.sub(lambda _match, tagged=tagged: tagged, text)

        return text

    @staticmethod
    def tag_speakers(text, speakers):
        """
        Tags speakers in the text
        :param text: text to tag
        :param speakers: speakers to tag
        :return: the tagged text
        """
        for spk in speakers:
            if not spk:
                # An empty name matches at the first word boundary and the
                # following replace('') would tag every position.
                continue
            escaped = re.escape(spk)
            # The name as a whole word, directly after sentence punctuation,
            # or enclosed in parentheses.
            insensitive_spk = re.compile(
                r'(\b({})\b|[.?!]({})\b|\(({})\))'.format(escaped, escaped, escaped),
                re.IGNORECASE)
            found = insensitive_spk.search(text)
            if found is None:
                # Speaker not mentioned in the text: nothing to tag.
                continue
            name = found.group(1)
            clean = name.strip()
            text = text.replace(name, '<speaker>' + clean + '</speaker>')

        return text

    def tag_seminar(self, path, directory, extractor):
        """
        Tags seminar with all previously found data and writes the data to a file.
        Only ``.txt`` files are processed; files whose content cannot be split
        into header and body are skipped. Results are written to ``out/``
        under the same file name.
        :param path: the path to the untagged files; it is concatenated directly
                     with the file name, so it has to end with a path separator
        :param directory: the directory they are in
        :param extractor: the extractor class to extract data
        """
        for entry in tqdm(os.listdir(directory)):
            filename = os.fsdecode(entry)
            if not filename.endswith(".txt"):
                continue

            with open(path + filename, 'r', encoding='utf-8') as f:
                placeholder = f.read().strip('\n -*')

            # Splits the text into header and body
            try:
                header, body = re.search(header_body_regx_str, placeholder).groups()
            except (AttributeError, ValueError):
                # AttributeError: the pattern did not match (search gave None);
                # ValueError: it did not yield exactly two groups.
                continue

            header = header.rstrip('\n')

            stime, etime = extractor.extract_time(header)
            locations = extractor.extract_location(header, body, self)
            speakers = extractor.extract_speaker(header, body, self)

            body = self.tag_paragraphs(body)
            body = self.tag_sentences(body)

            seminar = header + '\n\n' + body
            seminar = self.tag_times(stime, etime, seminar)
            seminar = self.tag_speakers(seminar, speakers)
            seminar = self.tag_locations(locations, seminar)

            out_location = "out/"
            Utils.mkdir_p(out_location)
            # Written as UTF-8 to mirror how the input is read, and inside a
            # with-block so the file is closed even if the write fails.
            with open(out_location + filename, "w", encoding='utf-8') as out:
                out.write(seminar)
from nltk.tag import StanfordNERTagger
import string, sys

from settings import config, db

# Variables
# NOTE: these assignments rebind the names ``config`` and ``db`` imported from
# ``settings`` to the objects returned by getConfig() / getDB().
config = config.getConfig()
db = db.getDB()

# Stanford NER tagger with the 4-class CoNLL English model; the trailing
# comment keeps the paths of an alternative system-wide installation.
st = StanfordNERTagger(
    "classifiers/english.conll.4class.distsim.crf.ser.gz",
    "classifiers/stanford-ner.jar"
)  #"/usr/share/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz", "/usr/share/stanford-ner/stanford-ner.jar")

# Declare the collections
articles = db.articles
indicators = db.indicators
analysis = db.analysis
stanford_named_ents = db.stanford_named_entities


def extract():
    """Drop ``stanford_named_entities`` and walk over every stored article.

    For each article the text and URL are read and the collection is checked
    for an existing document with the same URL.

    NOTE(review): the body is cut off after that check in this excerpt, so
    what is stored per article and how errors from the ``try`` are handled
    cannot be documented from here.
    """
    count = 0
    # Start from an empty result collection.
    stanford_named_ents.drop()

    # no_cursor_timeout=True is passed so the server does not expire the cursor.
    for article in articles.find(no_cursor_timeout=True):
        try:
            text = article['article_text']
            url = article['url']

            # Only continue for URLs that have no stored document yet.
            if not (stanford_named_ents.find_one({"url": url})):
def normalize(text):
    """Lower-case ``text``, translate it, tokenize it and stem the tokens.

    The lower-cased text is passed through the ``remove_punctuation_map``
    translation table, split with ``nltk.word_tokenize`` and the resulting
    tokens are handed to ``stem_tokens``, whose result is returned.
    """
    cleaned = text.lower().translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(cleaned)
    return stem_tokens(tokens)


# Shared TF-IDF vectorizer: tokenizes with normalize() and is configured with
# the built-in English stop-word list.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')


def cosine_sim(text1, text2):
    """Return the similarity score of two texts from their TF-IDF vectors.

    Both texts are fitted and transformed together by the module-level
    ``vectorizer``; the value returned is entry ``[0, 1]`` of the product of
    the resulting matrix with its transpose, i.e. the score between the first
    and the second text.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    pairwise = tfidf.dot(tfidf.T).toarray()
    return pairwise[0, 1]


# Stanford NER tagger loaded with the English 3-class model bundled under
# ./static/other_files.
st = StanfordNERTagger(
    './static/other_files/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    './static/other_files/stanford-ner-2014-06-16/stanford-ner.jar',
    encoding='utf-8')


def disambiguate(text_document_name, main_directory, download_directory,
                 tag_search):
    """Read a text document and tokenize it for entity disambiguation.

    NOTE(review): only the start of the function is visible in this excerpt;
    ``download_directory`` is not used in the visible part and the return
    value cannot be documented from here.

    :param text_document_name: file name of the document inside ``main_directory``
    :param main_directory: directory that contains the document
    :param download_directory: not used in the visible part
    :param tag_search: entity tag to look for, 'PERSON' or 'LOCATION'
    :raises ValueError: if ``tag_search`` is neither 'PERSON' nor 'LOCATION'
    """
    # tag_search is either PERSON or LOCATION
    if tag_search not in ('PERSON', 'LOCATION'):
        raise ValueError(
            "tag_search parameter can only have values 'PERSON' or 'LOCATION' "
        )
    tagged_persons = []
    # No encoding is passed to codecs.open here.
    with codecs.open(os.path.join(main_directory, text_document_name),
                     "r") as text_document:
        text = text_document.read()
        tokenized = nltk.word_tokenize(text)
	def namedEntityRecognize(self, sentence):
		"""Perform NER on the sentence and return the tagged tokens.

		The sentence is split on whitespace, tagged with a Stanford NER tagger
		built from ``self.modelPath``, printed, and returned.

		:param sentence: the sentence to tag, as a single string
		:return: a list of tuples of (word, ne-recognized tag)
		"""
		st = StanfordNERTagger(self.modelPath)
		# Tag once and reuse the result for both the print and the return, so
		# the tagger is not run a second time on the same input.
		tagged = st.tag(sentence.split())
		print(tagged)
		return tagged
Beispiel #41
0
	outfile1.close()
	outfile2.close()

	# Get the relations between team and coach
	print "Starts Extracting Coaches!"
	q = open("coach.rq").read()
	results = G.query(q)
	outfile = open("../output/coach.tsv", "w")
	for row in results:
		outfile.write("%s\t%s\n" % (row[0], row[1]))
	outfile.close()

	# Get the entities of stadium
	print "Start Identifying Stadiums!"
	StanfordNERPath = './stanford-ner'
	st = StanfordNERTagger(StanfordNERPath + '/classifiers/english.all.3class.distsim.crf.ser.gz', StanfordNERPath + '/stanford-ner.jar')

	indicator = set([u'is', u'was', u'are', u'were'])
	noun_tag = set(['NN', 'NNS', 'NNP', 'NNPS'])

	q = open("withDoc.rq").read()
	results = G.query(q)
	outfile = open("../output/stadium.tsv", "w")
	count = 0
	for row in results:
		count += 1
		if count % 10 == 0:
			print "%d documents has been processed" % count
		text = row[1]
		tags = st.tag(word_tokenize(text))
		firstSentence = ''
Beispiel #42
0
import ner
from nltk.tag import StanfordNERTagger

# Root of the local Stanford NER installation; the trailing slash matters
# because the paths below are built by plain string concatenation.
stanford_ner_dir = '/home/will/packages/stanfordNER/'
my_path_to_jar = stanford_ner_dir + 'stanford-ner.jar'
eng_model_filename = stanford_ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'

st = StanfordNERTagger(eng_model_filename, path_to_jar=my_path_to_jar)
# Smoke-test sentence; the tagging result is not stored.
sample_tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
st.tag(sample_tokens)

# tagger = ner.HttpNER(host='localhost', port=8080)
# tagger.get_entities("University of California is located in California, United States")
        tree = parse_trees[0]
        # get all NP trees and extract their leaves
        # Use help(nltk.tree.Tree) to find out which NLTK method you can use to do this
        for s in tree.subtrees(lambda tree: tree.label() == "NP"):
            print(s.leaves())

# In[29]:

# Named Entity Recognition (Using Stanford NLP)
from nltk.tag import StanfordNERTagger
import os
import pandas as pd
# Tell the Stanford wrapper where the Java executable is.
# NOTE(review): this is a Windows path while the tagger and input paths below
# are POSIX-style -- confirm which machine this is meant to run on.
java_path = 'D:/jdk-13.0.2/bin/java.exe'
os.environ['JAVA_HOME'] = java_path
# Stanford NER tagger loaded with the English 3-class model.
sner = StanfordNERTagger(
    '/home/lzanella/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    path_to_jar='/home/lzanella/stanford-ner-2018-10-16/stanford-ner.jar')

named_entities = []

# Undecodable bytes are dropped (errors='ignore') rather than raising.
with open("/home/lzanella/ameliepoulain.txt",
          "r",
          encoding="utf8",
          errors='ignore') as infile:
    content = infile.read()
    # ``nltk`` is not imported in the visible lines of this cell; presumably
    # it is imported earlier in the file.
    sentences = nltk.sent_tokenize(content)

    counter = 0

    # Only the first three sentences are visited; the loop body is not visible
    # in this excerpt.
    for sentence in sentences[0:3]:
        # print("\n SENTENCE %i : %s \n \n NE: \n"%(counter,sentence))
Beispiel #44
0
from Liwc_Trie_Functions import create_trie, get_liwc_categories
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from nltk import word_tokenize
from empath import Empath

# Raw and pre-processed datasets.
df = pd.read_csv('dataset.csv')
ppd = pd.read_csv('pre_processed_dataset.csv')

# Encoders used to turn tag labels into one-hot vectors.
ohe = OneHotEncoder()
lb = LabelEncoder()

# Stanford NER tagger (3-class English model).
jar_n = '/localhome/debarshi/sarcasm/stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'
model_n = '/localhome/debarshi/sarcasm/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'
ner_tagger = StanfordNERTagger(model_n, jar_n, encoding='utf8')

# Stanford POS tagger (english-left3words model).
jar = '/localhome/debarshi/sarcasm/stanford-postagger-2018-10-16/stanford-postagger-3.9.2.jar'
model = '/localhome/debarshi/sarcasm/stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger'
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

# POS features: tag every snippet, keep only the tag of each (word, tag) pair,
# label-encode the tags and one-hot encode the resulting ids.
POS_snippets = []
for snippet in df['Snippet']:
    tagged_snippet = pos_tagger.tag(word_tokenize(snippet))
    POS_snippets.extend(tagged_snippet)
POS_snippets_type = [pos_tag for _, pos_tag in POS_snippets]
POS_snippets_type = lb.fit_transform(POS_snippets_type)
# The encoder expects a 2-D column, hence the reshape to (-1, 1).
pos_vec = ohe.fit_transform(np.reshape(POS_snippets_type, (-1, 1)))
pos_vec = pos_vec.todense()