def trial2():
    """
    Let's try using the nltk and one of the readability texts
    :return:
    """
    pretrained_model_path = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/www-experiments/stanford-ner-2015-12-09/'
    all3class = pretrained_model_path+'classifiers/english.all.3class.distsim.crf.ser.gz'
    conll4class = pretrained_model_path+'classifiers/english.conll.4class.distsim.crf.ser.gz'
    muc7class = pretrained_model_path+'classifiers/english.muc.7class.distsim.crf.ser.gz'
    st_muc = StanfordNERTagger(muc7class,
                           pretrained_model_path+'stanford-ner.jar',
                           encoding='utf-8')
    st_conll = StanfordNERTagger(conll4class,
                           pretrained_model_path+'stanford-ner.jar',
                           encoding='utf-8')
    st_3class = StanfordNERTagger(all3class,
                                 pretrained_model_path + 'stanford-ner.jar',
                                 encoding='utf-8')
    annotated_cities_file = '/Users/mayankkejriwal/datasets/memex-evaluation-november/annotated-cities/ann_city_title_state_1_50.txt'
    TP = 0
    FP = 0
    FN = 0
    with codecs.open(annotated_cities_file, 'r', 'utf-8') as f:
        for line in f:
            obj = json.loads(line)
            text = obj['high_recall_readability_text']
            tokenized_text = word_tokenize(text)
            classified_text_muc = st_muc.tag(tokenized_text)
            classified_text_conll = st_conll.tag(tokenized_text)
            classified_text_3class = st_3class.tag(tokenized_text)
            tagged_locations = set()

            correct_locations = _build_locations_true_positives_set(obj, ['correct_cities','correct_states','correct_cities_title'])
            # if 'correct_country' in obj and obj['correct_country']:
            #     correct_locations = correct_locations.union(set(TextPreprocessors.TextPreprocessors._preprocess_tokens
            #                                                     (obj['correct_country'].split(),['lower'])))
            for i in range(0, len(classified_text_muc)):
                tag_muc = classified_text_muc[i]
                tag_conll = classified_text_conll[i]
                tag_3class = classified_text_3class[i]
                if str(tag_3class[1]) == 'LOCATION':
                # if str(tag_muc[1]) == 'LOCATION' or str(tag_conll[1]) == 'LOCATION' or str(tag_3class[1]) == 'LOCATION':
                    tagged_locations.add(tag_3class[0].lower())
            # print tagged_locations
            # print correct_locations
            TP += len(tagged_locations.intersection(correct_locations))
            FP += (len(tagged_locations)-len(tagged_locations.intersection(correct_locations)))
            FN += (len(correct_locations)-len(tagged_locations.intersection(correct_locations)))
            # print classified_text[0][1]
            # print(classified_text)
            # break
    print 'TP, FP, FN are...'
    print TP
    print FP
    print FN
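For reference, the raw TP/FP/FN counts printed above can be folded into precision, recall, and F1 with a small helper (a minimal sketch, not part of the original example):

def compute_prf1(tp, fp, fn):
    # precision = TP / (TP + FP); recall = TP / (TP + FN); F1 = harmonic mean of the two
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1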
Example #2
def get_location(loc):
    """
    Currently works only on my machine.
    English model:
        english.muc.7class.distsim.crf.ser.gz
    German models:
        german.dewac_175m_600.crf.ser.gz
        german.hgc_175m_600.crf.ser.gz
    """
    # Named Entity Recognizer: recognizes named entities and assigns types like location, person, organization to the entity
    st = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
    'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    loc_ner = st.tag(loc)
    """
    might be faster starting from back to front
        'LOCATION' for English
        'I-LOC' for German
    """
    # code that glues named entities like 'New York' back together
    loc_tuples = [item[0] for item in loc_ner if 'LOCATION' in item]
    try:
        location = loc_tuples[0]
        if len(loc_tuples) > 1:
            for i in range(1,len(loc_tuples)):
                location += ' ' + loc_tuples[i]
    except IndexError:
        # if no location is specified
        return None
    return location
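The docstring above also lists German classifiers; a minimal sketch of the German counterpart follows (the classifier/jar paths are assumptions, and per the docstring the German models emit 'I-LOC' rather than 'LOCATION'):

from nltk.tag import StanfordNERTagger

def get_location_de(loc):
    # Sketch only: these paths are placeholders for a local Stanford NER download
    st_de = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/german.hgc_175m_600.crf.ser.gz',
                              'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    tagged = st_de.tag(loc)
    # German models label locations as 'I-LOC' instead of 'LOCATION'
    parts = [tok for tok, label in tagged if label == 'I-LOC']
    return ' '.join(parts) if parts else None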
def extract_named_entities(threadName,output_collection,fetchedTweets):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    try:
        counter = 0
        mongo_list = []
        for fetchedTweet in fetchedTweets:
            counter += 1
            named_entities = []
            sentence = fetchedTweet['cleaned_text']
            neList = st.tag(sentence.split())
            for ne in neList:
                if ne[1] in ['PERSON', 'ORGANIZATION', 'LOCATION']:
                    named_entities.append((ne[0], ne[1]))
            fetchedTweet['named_entities'] = named_entities
            
            mongo_list.append(fetchedTweet)
            if counter % 100 == 0:
                logging.info("{}: Tweets processed: {} tweets".format(threadName, counter))
                write_mongo(threadName,output_collection,mongo_list)
                mongo_list = []
        if len(mongo_list) > 0:
            write_mongo(threadName,output_collection,mongo_list)
            mongo_list = []
    except Exception as e:
        print(e)
        sys.exit()
def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')

    text = sent_obj.sentence
    tokenized_text = list()
    spans = list()
    #Recover spans here
    for match in re.finditer(r"\S+", text):
        start = match.start()
        end = match.end()
        word = match.group(0)
        tokenized_text.append(word.rstrip(",.;:"))
        spans.append((start,end))
    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Expand tuple to have span as well
    len_diff = len(spans) - len(classified_text)  # Headers were stripped in the previous step, so account for the offset here
    final_class_and_span = list()
    for idx,tup in enumerate(classified_text):
        combined = (classified_text[idx][0],classified_text[idx][1],spans[idx+len_diff][0],spans[idx+len_diff][1])
        final_class_and_span.append(combined)

    #print(classified_text)
    sent_obj.tok_sent_with_crf_predicted_attribs[type] = final_class_and_span
    return sent_obj
	def pretag(self):
		text=self.text
		st = StanfordNERTagger("/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz",\
	"/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/stanford-ner.jar")
		paragraphs = []
		paragraphs_string=''
		for x in text:
			paragraphs.append(str(x))
		paragraphs_string=' '.join(paragraphs)
		tagging=st.tag(paragraphs_string.split())
		symlist=[ 'company','corporation','multinational', 'Corporation','open-source','social', 'network','software','system']
		badlist=['integrated','first','check','computer','linear', 'solution','services','limited','tech','solutions','technology','open','model','on','applied','network', 'pricing','customers','social','big','subscribe','social','sign','monitor','software','machine','learning','compute','management','up']
		badlist_stem=[]
		self.badlist=badlist
		self.symlist=symlist
		for i in range(len(badlist)):
			badlist_stem.append(stemmer.stem(badlist[i]))
		self.badlist_stem=badlist_stem
		pretag1= [tag for (tag,label) in tagging if label in set(("ORGANIZATION","PERSON")) or (count_upper(tag)>=2 and len(tag)<11 ) ]
		pretag2=[tag for (tag,label) in tagging if tag.lower() in dict_1m or tag in dict_apps]
		pretag3=[tag for (tag,label) in tagging if tag.lower() in dict_tech]
		pretag= pretag1+pretag2+pretag3
		domain2synsets = defaultdict(list)
		synset2domains = defaultdict(list)
		self.pretag=pretag
Example #6
def ner():
	os.environ['STANFORD_NER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer'
	os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
	os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/stanford-ner.jar'
	os.environ['STANFORD_POSTAGGER'] = os.environ['CLASSPATH']

	eng_tagger = StanfordNERTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/classifiers/english.all.3class.distsim.crf.ser.gz')
	for x in content:
		print(eng_tagger.tag(x.split()))
def NERTagging(text):
    log_file = open("Dump/log/Main_output.txt", "a")
    st = StanfordNERTagger('resources/ner/classifiers/english.all.3class.distsim.crf.ser.gz',
					   'resources/ner/stanford-ner.jar',
					   encoding='utf-8')
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    log_file.write('NER \n %s \n' % classified_text)
    print(classified_text)
    log_file.close()
    return
def getEntityCount(tweet):
    # Use the Stanford NER Tagger
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') 
    # tokenize the tweet
    tokenized_text = word_tokenize(tweet)
    classified_text = st.tag(tokenized_text)
    countPerson = 0
    for text in classified_text:
        if "PERSON" in text[1]:
            countPerson += 1
    return countPerson
Example #9
def nltk_ner(remainders):
	st = StanfordNERTagger('../stanford-ner/english.all.3class.distsim.crf.ser.gz', '../stanford-ner/stanford-ner.jar') 
	for item in remainders:
		name = ""
		tagged = st.tag(item.split())
		for entity in tagged:
			if entity[1] == u'PERSON':
				name += (entity[0].title() + ' ')
		if name: 
			return True, name, item
		else:
			return False, name, item
    def getEntities(self, sentence):

        st = StanfordNERTagger(
            '/home/ubuntu/english.all.3class.distsim.crf.ser.gz',
            '/home/ubuntu/stanford-ner.jar',
            encoding='utf-8')
        tokenized_text = word_tokenize(sentence)
        classified_text = [i[0] for i in st.tag(tokenized_text) if i[1] != "O"]
        stanfordNames = [
            i[0] for i in st.tag(tokenized_text) if i[1] == "PERSON"
        ]
        #text = "#BUSINESSofTheDay : #Bankruptcy Reveals Wild List: Who Weinstein Owes #Money To: Malia Obama, #DavidBowie, #MichaelBay http://a.msn.com/0C/en-us/BBKu067?ocid=st … #JudiDench #QuentinTarantinao #MichaelBay #DanielRadcliffe #RobertDeNiro #SexualAssault #MeToo #TimesUp #USA #EU #UK"
        #print sentence

        nlp = spacy.load('en')
        text = nlp(sentence.decode("utf-8"))
        #return [X.text for X in text.ents if X.label_ == "PERSON"]

        res = set([])
        for X in text.ents:
            if X.label_ == "PERSON":
                name = X.text.split()
                if len(name) > 1:
                    if name[0] in classified_text and name[
                            1] in classified_text:
                        res.add(X.text)
                    elif name[0] in classified_text:
                        res.add(name[0])
                    elif name[1] in classified_text:
                        res.add(name[1])
                else:
                    res.add(X.text)

        for name in stanfordNames:
            res.add(name)
        for word in sentence.split():
            if word[0] == "@":
                res.add(word[1:])

        return res
Example #11
def main():
    f = open("ada_lovelace.txt")
    raw = f.read()
    tokens = nltk.word_tokenize(raw)
    tagged = nltk.pos_tag(tokens)
    nouns = [token for token, pos in tagged if pos.startswith('N')]
    lemmatizer = WordNetLemmatizer()
    noun_lemmas = []

    st = StanfordNERTagger(
        '/home/thomas/Downloads/stanford-ner-2018-02-27/classifiers/english.conll.4class.distsim.crf.ser.gz',
        '/home/thomas/Downloads/stanford-ner-2018-02-27/stanford-ner.jar')

    l = st.tag(raw.split())
    sorted_l = sorted(l, key=lambda x: x[1])
    print(sorted_l)

    #exercise2.3
    print("Exercise 2.3, nouns: ")
    n = st.tag(nouns)
    new = [tuple(s if s != "0" else "MISC" for s in tup) for tup in n]
    print(new)
Example #12
    def sf_ner_tagger(self, para):

        stanford_ner_dir = '/Users/Rena/StandfordParserData/stanford-ner-2018-02-27/'
        eng_model_filename = stanford_ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
        my_path_to_jar = stanford_ner_dir + 'stanford-ner.jar'

        tagger = StanfordNERTagger(model_filename=eng_model_filename,
                                   path_to_jar=my_path_to_jar)

        pattern = '^[A-Za-z]+$'
        tokens = [t for t in para.split() if re.match(pattern, t)]

        return tagger.tag(tokens)
Example #13
def entity_recognition_stanford(tokens, base_path):
    ''' Uses the Stanford NER model wrapped in NLTK'''
    classifier_model_path = base_path + '/english.muc.7class.distsim.crf.ser.gz'
    ner_jar_path = base_path + '/stanford-ner.jar'
    stanford_tagger = StanfordNERTagger(classifier_model_path,
                                        ner_jar_path,
                                        encoding='utf-8')
    ner_tagged = stanford_tagger.tag(tokens)
    replaced_text_list = [
        word[0] if word[1] == "O" else str('|' + word[1] + '|')
        for word in ner_tagged
    ]
    return ' '.join(replaced_text_list)
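A quick usage sketch for the function above (hypothetical: base_path must contain the 7-class classifier and stanford-ner.jar, and the exact labels depend on the model):

# tokens = word_tokenize("Angela Merkel visited Paris in March 2015.")
# print(entity_recognition_stanford(tokens, '/path/to/stanford-ner'))
# prints something like: '|PERSON| |PERSON| visited |LOCATION| in |DATE| |DATE| .'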
def find_NER_type(tokenized_text):
    """
    Named Entity Recognition.
    Finds the NER type of each token
    :param tokenized_text:
    :return:
    """
    from nltk.tag import StanfordNERTagger
    st = StanfordNERTagger('../exist-stanford-ner/resources/classifiers/english.all.3class.distsim.crf.ser.gz',
                           '../exist-stanford-ner/java/lib/stanford-ner-2015-04-20.jar', encoding='utf-8')
    classified_text = st.tag(tokenized_text)

    return classified_text
Example #15
class NerAnalyzer:
    def __init__(self, modelFileFile, nerJarFile):
        self.st = StanfordNERTagger(modelFileFile, nerJarFile)

    def analyze(self, text):
        result = {}
        if (text):
            tags = self.st.tag(str(text).split())
            personTags = [item[0] for item in tags if item[1] == 'PERSON']
            if (any(personTags)):
                result['Person tags by NER'] = personTags

        return result
def recognizeEntities(line):
    try:
        tagger = StanfordNERTagger(
            "Stanford-NER/english.muc.7class.distsim.crf.ser.gz",
            "Stanford-NER/stanford-ner.jar",
            encoding="utf-8")
    except LookupError as e:
        print(e)
        raise SystemExit

    tokens = word_tokenize(line)
    taggedTokens = tagger.tag(tokens)
    return taggedTokens
def start(raw_text, username):
    # Stanford classifier and Stanford NER paths
    stanford_classifier = '..\stanford-ner-2017-06-09\classifiers\english.muc.7class.distsim.crf.ser.gz'
    stanford_ner_path = '..\stanford-ner-2017-06-09\stanford-ner.jar'

    # Creating Tagger Object
    st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')

#    # Get the Input
#    with open('input.txt', 'r') as myfile:
#        raw_text=myfile.read().replace('\n', '')

    # Tokenize the Sentence and the Words
    sentence_tokenized = nltk.sent_tokenize(raw_text)
    word_tokenized = [nltk.word_tokenize(sent) for sent in sentence_tokenized]

    # Creates a list of lists that stores all the chunked words (with entities) for each and every tokenized sentence
    all_chunked_words = []
    for words in word_tokenized:
        chunked_words = GetChunkWords(st.tag(words))
        all_chunked_words.append(chunked_words)

    namedentities = []
    for words in word_tokenized:
        namedentities.append(WriteChunkedWordsToFile(st.tag(words)))
    
    with open('output\\namedentities.txt', 'w') as outfile:
        json.dump(namedentities, outfile) 
    
    # Creates a list of sentiment values for each sentence in the raw text
    # result = GetListOfSentimentsforeachSentence(raw_text)
    sentiment_list = GetListOfSentimentsforeachSentence(raw_text)
    
    # Creates a Dictionary of key-value pair, where key is the token and value is the average sentiment
    all_tokens_sentiment = GetListOfSentimentsforeachToken(all_chunked_words,sentiment_list)

    # Register Code     
    with open('output\\'+ username +'.txt', 'w+') as outfile:
        json.dump(all_tokens_sentiment, outfile)
def detection(document):
    # stanford's NER tagger 3 entity classification PERSON LOCATION ORGANIZATION O(other)
    stan_ner = StanfordNERTagger(
        '/home/iraklis/Desktop/PhDLocal/Tools/stanford-ner-2017-06-09/classifiers/'
        'english.all.3class.distsim.crf.ser.gz',
        '/home/iraklis/Desktop/PhDLocal/Tools/stanford-ner-2017-06-09/'
        'stanford-ner.jar',
        encoding='utf-8')
    tokenized_text = word_tokenize(document)
    classified_text = stan_ner.tag(tokenized_text)

    write_on_file(classified_text)

    # Chunking entities (for example first name with last name)
    previous_tuple = classified_text[0]
    entity_str = ""
    if previous_tuple[1] != 'O':
        entity_str += previous_tuple[0]

    store_entities = dict()
    store_entities["P"] = []
    store_entities["L"] = []
    store_entities["O"] = []

    for txt in classified_text[1:-1]:
        if txt[1] != 'O':
            if txt[1] == previous_tuple[1]:
                entity_str += " " + txt[0]
            else:
                entity_str = txt[0]
        else:
            if entity_str != "":
                store_entities[previous_tuple[1][0]].append(entity_str)
                entity_str = ""

        previous_tuple = txt

    # We are using OrderedDict to delete duplicates and preserve the insertion order
    store_entities["P"] = remove_entities(
        list(OrderedDict((x, True) for x in store_entities["P"]).keys()))
    store_entities["L"] = remove_entities(
        list(OrderedDict((x, True) for x in store_entities["L"]).keys()))
    store_entities["O"] = remove_entities(
        list(OrderedDict((x, True) for x in store_entities["O"]).keys()))

    # We transform the article_entity_dict_list in order to contain the entities of
    # each article sentence. The article entities will be calculated when it is needed.
    sentence_entities = entity_tools.transform_article_dict(
        document, store_entities)

    return sentence_entities
Example #19
def remove_names(text):
    meaningful_words = []
    tagged_word = []
    tags = ['LOCATION', 'ORGANIZATION', 'PERSON']
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

    for sentence in text:
        tagged_word.append(st.tag(sentence))

    for words in tagged_word:
        for word in words:
            if word[1] in tags:
                meaningful_words.append(word[0])
    return meaningful_words
Example #20
def orgs(text):
	st = StanfordNERTagger('trained_model/stanford_ner/classifiers/english.all.3class.distsim.crf.ser.gz',
						   'trained_model/stanford_ner/stanford-ner.jar',
						   encoding='utf-8')

	tokenized_text = word_tokenize(text)
	classified_text = st.tag(tokenized_text)

	names_list = []
	for tag_tuple in classified_text:
		if tag_tuple[1] == 'ORGANIZATION':
			names_list.append(tag_tuple[0])

	return names_list
Example #21
def text_tokinization(text):
    # Change the path according to your system
    stanford_classifier = 'C:\Farshid\programming\Java\stanford-ner-2016-10-31\classifiers\english.all.3class.distsim.crf.ser.gz'
    stanford_ner_path = 'C:\Farshid\programming\Java\stanford-ner-2016-10-31\stanford-ner.jar'

    # Creating Tagger Object
    st = StanfordNERTagger(stanford_classifier,
                           stanford_ner_path,
                           encoding='utf-8')

    tokenizedText = word_tokenize(text)
    classifiedText = st.tag(tokenizedText)

    return (classifiedText)
Example #22
def reader():
    f=open("../chat_history/q.txt",'rb')
    file=open("../chat_history/ner_train2.txt",'ab')
    tagger=StanfordNERTagger('/Users/vishnuchopra/Project/stanford-ner-2017-06-09/classifiers/english.conll.4class.distsim.crf.ser.gz', '/Users/vishnuchopra/Project/stanford-ner-2017-06-09/stanford-ner.jar')
    for line in f:
        print(line)
        line=line.strip()
        tags=tagger.tag(line.split())
        for tag in tags:
            nert=tag[1]
            print(nert+' ')
            file.write(nert+' ')
        file.write('\n')
        print('\n')
Example #23
def stanford_ner(words, args):
    start = time.time()
    """
    3 class: Location, Person, Organization
    4 class: Location, Person, Organization, Misc
    7 class: Location, Person, Organization, Money, Percent, Date, Time
    """
    ner_classifier_path = 'english.all.3class.distsim.crf.ser.gz'  # default 3 class

    if args.ner_class == 7:
        ner_classifier_path = 'english.muc.7class.distsim.crf.ser.gz'
    elif args.ner_class == 4:
        ner_classifier_path = 'english.conll.4class.distsim.crf.ser.gz'

    ner_classifier_full_path = os.path.join(stanford_ner_directory_path,
                                            'classifiers', ner_classifier_path)
    ner_jar_path = os.path.join(stanford_ner_directory_path,
                                'stanford-ner.jar')
    s_ner_tagger = StanfordNERTagger(ner_classifier_full_path,
                                     ner_jar_path,
                                     encoding='UTF-8')
    _tagged = s_ner_tagger.tag(words)

    # NLP BIO tags processing (B-beginning NE, I-inside NE, O-outside NE)
    bio_tagged = []
    prev_tag = "O"
    for token, tag in _tagged:
        if tag == "O":  # O
            bio_tagged.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O":  # Begin NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag:  # Inside NE
            bio_tagged.append((token, "I-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
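    # For illustration: [("New", "LOCATION"), ("York", "LOCATION"), ("is", "O")]
    # becomes [("New", "B-LOCATION"), ("York", "I-LOCATION"), ("is", "O")] after this loop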

    # convert bio_tags to NLTK tree-like format
    tokens, ne_tags = zip(*bio_tagged)
    pos_tags = [pos for token, pos in get_pos_tags(tokens, args)]
    conlltags = [(token, pos, ne)
                 for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
    ne_tree = conlltags2tree(conlltags)

    print 'Stanford NER took %.3f sec, NEs are:\n %s\n' % (
        time.time() - start, structure_ne(ne_tree))
Example #24
def Locations_in_text(text):
    # This function tags every word in the text and returns those tagged LOCATION as a list

    st = StanfordNERTagger(NER_FOLDER +
                           'english.all.3class.distsim.crf.ser.gz')

    TagedNamedEntity = st.tag(text.split())
    Locations = []
    for NamedEntity in TagedNamedEntity:
        name = NamedEntity[0]
        tag = NamedEntity[1]
        if tag == "LOCATION":
            Locations.append(name)
    return Locations
Example #25
def extractNER(sentence):
    model = '/Users/pranavr/anaconda/lib/python3.5/site-packages/stanford-ner-2017-06-09/classifiers/english.muc.7class.distsim.crf.ser.gz'
    jar = '/Users/pranavr/anaconda/lib/python3.5/site-packages/stanford-ner-2017-06-09/stanford-ner.jar'

    st = StanfordNERTagger(model, jar, encoding='utf-8')

    tokenized_text = word_tokenize(sentence)
    classified_text = st.tag(tokenized_text)
    taggedWords = []
    for tup in classified_text:
        if (tup[1] != 'O'):
            taggedWords.append(tup)

    return taggedWords
Example #26
def ret_loc_ner(tweet):
    st = StanfordNERTagger('stanford_ner/english.all.3class.distsim.crf.ser.gz',
					   'stanford_ner/stanford-ner.jar',
					   encoding='utf-8')

    tokenized_text = word_tokenize(tweet)
    classified_text = dict(st.tag(tokenized_text))
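    # Note: wrapping the tagged pairs in dict() keeps one entry per distinct token,
    # so repeated mentions of the same location collapse to a single key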

    list_of_locations = []
    for word, entity in classified_text.items():
        if entity == "LOCATION":
            list_of_locations.append(word)

    return list_of_locations
Example #27
    def get_ner_document_tags(self, document_id):

        ner = StanfordNERTagger(
            'C:/Users/1/James/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
            'C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar')

        for document in self.corpus[document_id]:
            for line in document:
                tags = ner.tag(' '.join(line).split())
                for (word, tag) in tags:
                    if tag != 'O':
                        self.ner_corpus_tags.append(word)
        return self.ner_document_tags
Example #28
class named_entity:
    def __init__(self):
        self.tagger = StanfordNERTagger(
            'stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
            'stanford-ner-2018-10-16/stanford-ner.jar',
            encoding='utf-8')
        #filename = "filename9500.pickle"
        #with open(filename,'rb') as handle:
        # _dic = pickle.load(handle)
        self.named_entity_count = {}

    def get_named_entity(self, tweet):
        per, loc, org = 0, 0, 0
        #print(tweet)
        text = tweet[1]
        tokenized_tweet = word_tokenize(text)
        taggs = self.tagger.tag(tokenized_tweet)

        _dic = {}
        _dic["PERSON"] = ""
        _dic["LOCATION"] = ""
        _dic["ORGANIZATION"] = ""

        for tag, chunk in groupby(taggs, lambda x: x[1]):
            ans = " ".join(w for w, t in chunk)
            if tag in _dic:
                _dic[tag] += " " + str(ans.encode('utf-8'))
                #print(ans)
                if str(tag) == "O":
                    continue
                if str(tag) == "PERSON":
                    per += 1
                elif str(tag) == "LOCATION":
                    loc += 1
                elif str(tag) == "ORGANIZATION":
                    org += 1
                if str(ans.encode('utf-8')).lower() in self.named_entity_count:
                    self.named_entity_count[str(
                        ans.encode('utf-8')).lower()] += 1
                else:
                    self.named_entity_count[str(
                        ans.encode('utf-8')).lower()] = 1

        entry = []
        entry.append(tweet[0])
        entry.append(tweet[2])
        entry.append(tweet[1])
        entry.extend([per, loc, org])
        return entry, _dic
def doc_ents(filename, f):
    ner = StanfordNERTagger(
        'C:/Users/1/James/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
        'C:/Users/1/James/stanford-ner-2015-12-09/stanford-ner.jar')

    document = []
    dd = {}
    with open(f, 'r') as myfile:
        text = myfile.readlines()

    for i, (word, tag) in enumerate(ner.tag(text)):
        if tag != 'O':
            document.append(word)
    dd[remove_extenstion(filename).title()] = set(document)
    return dd
Example #30
def NER(text, tagName):
    import nltk
    from nltk.tag import StanfordNERTagger
    from nltk.tokenize import word_tokenize
    st = StanfordNERTagger('./english.all.3class.distsim.crf.ser.gz',
                           './stanford-ner.jar',
                           encoding='utf-8')
    for sent in nltk.sent_tokenize(text):
        tokens = nltk.tokenize.word_tokenize(sent)
        tags = st.tag(tokens)
        name = []
        for tag in tags:
            if tag[1] == tagName: name.append(tag[0])
        name = " ".join(name)
    return name
Example #31
def namedEntityRecog(x):
	stanford_ner_tagger = StanfordNERTagger(
	'Documents/projects/praveenProject/sparkNLP/stanford_ner/classifiers/english.muc.7class.distsim.crf.ser.gz',
	'Documents/projects/praveenProject/sparkNLP/stanford_ner/stanford-ner-3.9.2.jar')
	results = stanford_ner_tagger.tag(x)
	#print('Original Sentence: %s' % (article))
	list1 = []
	for result in results:
		tag_value = result[0]
		tag_type = result[1]
		if tag_type != 'O':
			list1.append('Type: %s, Value: %s' % (tag_type, tag_value))

	filtered_list = [s for s in list1 if s!=""]
	return filtered_list
Example #32
def get_NER(text):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser',
                           'stanford-ner.jar',
                           encoding='UTF8')
    tokens = word_tokenize(text)
    NER_text = st.tag(tokens)
    # 'w' creates NER_log.txt if it does not exist and truncates it otherwise
    with open('NER_log.txt', 'w', encoding='UTF8') as f:
        for i in NER_text:
            if i[1] != 'O':
                f.write(str(i))
    return NER_text
class NERClassifier(object):
    def __init__(self):
        from nltk.tag import StanfordNERTagger
        parentdir = osp.join(osp.abspath(osp.join(os.getcwd(), os.pardir)),
                             'QuoraQuestionPairs')
        jarfile = osp.join(parentdir, 'data', 'stanford-ner-2015-04-20',
                           'stanford-ner-3.5.2.jar')
        modelfile = osp.join(parentdir, 'data', 'stanford-ner-2015-04-20',
                             'classifiers',
                             'english.all.3class.distsim.crf.ser.gz')
        self.tagger = StanfordNERTagger(modelfile, path_to_jar=jarfile)

    def __call__(self, sentence):
        if isinstance(sentence, list):
            return self.tagger.tag(sentence)
def trial1():
    """
    Just to make sure we're not screwing everything up.
    :return:
    """
    st = StanfordNERTagger('/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/annotated-cities-model.ser.gz',
                           '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/stanford-ner.jar',
                           encoding='utf-8')

    text = 'While in France, Mrs. Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)

    print(classified_text)
Example #35
def test_chinese_ner():
    # sent = u'小明硕士毕业于中国科学院计算所后在日本京都大学深造'
    sent = u'习近平于1978年至1982年在江苏工学院农业机械工程系农业机械专业学习,获工学学士学位'
    tokens = jieba.cut(sent)
    new_tokens = []
    for x in tokens:
        print x
        new_tokens.append(x)

    new_sent = u" ".join(new_tokens)
    print new_sent
    chi_tagger = StanfordNERTagger('chinese.misc.distsim.crf.ser.gz')
    ## sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
    for word, tag in chi_tagger.tag(new_sent.split()):
        print word.encode('utf-8'), tag
Example #36
def stanford_ne_tagger(tokens):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)

    st._stanford_jar = ':'.join(stanford_jars)
    tags = st.tag(tokens)
    continuous_chunks = get_continuous_chunks(tags)
    named_entities_str_tag = set()
    for ne in continuous_chunks:
        if (ne[0][1] == u'LOCATION'):
            named_entities_str_tag.add(
                lower(u' '.join([token for token, tag in ne])))

    return named_entities_str_tag
Example #37
def get_namedentities(text):
  """
  Returns named entities in text using StanfordNERTagger
  """
  st = StanfordNERTagger('utils/english.conll.4class.caseless.distsim.crf.ser.gz','utils/stanford-ner.jar')   
  ner_tagged = st.tag(text.lower().split())     
  
  named_entities = []
  if len(ner_tagged) > 0:
    for n in ner_tagged:
      if n[1]!='O':
        named_entities.append(remove_punctuation(n[0]))

  named_entities = [n for n in named_entities if n] 
  return named_entities
Example #38
def get_relations(articles):
    relations = {
        'NODE1': [],
        'NODE2': [],
        'TYPE': [],
        'DATE': [],
        'SOURCEINTEXT': [],
        'SOURCE': [],
        'CONTEXT': []
    }
    gz_path = "stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz"
    jar_path = 'stanford-ner-2018-10-16/stanford-ner.jar'
    st = StanfordNERTagger(gz_path, jar_path, encoding='utf-8')
    for text in range(len(articles['TEXT'])):
        print('Analysing for sentiment and relations in:\n',
              articles['SOURCE'][text])
        sentences = sentence(articles['TEXT'][text])
        for sent in sentences:
            # sentiment analysis
            sentiment = TextBlob(sent).sentiment[0]
            if sentiment > 0:
                sentiment = 'positive'
            elif sentiment < 0:
                sentiment = 'negative'
            else:
                sentiment = 'neutral'
            # relation extraction
            tags = st.tag(sent.split())
            uniques = []
            for t in tags:
                if t not in uniques:
                    uniques.append(t)
            relate = combinations(grouptags(uniques), 2)
            # add to the output
            for r in relate:
                relations['NODE1'].append(strip_punct(r[0][1]))
                relations['NODE2'].append(strip_punct(r[1][1]))
                relations['TYPE'].append(sentiment)
                relations['DATE'].append(articles['Date'][text])
                try:
                    relations['SOURCEINTEXT'].append(','.join(
                        articles['Authors'][text]))
                except TypeError:
                    relations['SOURCEINTEXT'].append(None)
                relations['SOURCE'].append(articles['SOURCE'][text])
                relations['CONTEXT'].append(sent)
    print('Finished Analysing all news sources.')
    return relations
Example #39
def stanford_entities(model,
                      jar,
                      fileids=None,
                      corpus=kddcorpus,
                      section=None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website.
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger = StanfordNERTagger(model, jar)
    section = section

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(
                list(sectpull([fileid], section=section))[0][1])
        else:
            text = corpus.words(fileid)

        chunk = []

        for token, tag in tagger.tag(text):
            if tag == 'O':
                if chunk:
                    # Flush the current chunk
                    etext = " ".join([c[0] for c in chunk])
                    etag = chunk[0][1]
                    chunk = []

                    if etag == 'PERSON':
                        key = 'persons'
                    elif etag == 'ORGANIZATION':
                        key = 'organizations'
                    elif etag == 'LOCATION':
                        key = 'locations'
                    else:
                        key = 'other'

                    results[fileid][key].append(etext)

            else:
                # Build chunk from tags
                chunk.append((token, tag))

    return results
Example #40
class StanfordNER(NER):
    def __init__(self, path=None):
        if path is None:
            path = os.path.dirname(os.path.realpath(__file__)) + '/'
        self.ner = StanfordNERTagger(
            path +
            'stanford/classifiers/english.all.3class.distsim.crf.ser.gz',
            path + 'stanford/stanford-ner.jar',
            encoding='utf-8')

        self.set = NER.allowed_tags

    def tag(self, text, language=None, **kwargs):
        tokenized_text = word_tokenize(text)
        classified_text = self.ner.tag(tokenized_text)
        return classified_text
Example #41
class Dictionary:
    def __init__(self):
        self.dictionary = PyDictionary()
        self.st = StanfordNERTagger(
            'D:\Python\stanford-ner-2016-10-31\classifiers\english.all.3class.distsim.crf.ser.gz',
            'D:\Python\stanford-ner-2016-10-31\stanford-ner.jar')

    def getMeaning(self, word):
        tokenized_text = word_tokenize(word)
        classified_text = self.st.tag(tokenized_text)

        print(classified_text)

        for i in classified_text:
            if (i[1] == 'WORD'):
                return self.dictionary.meaning(i[0])
def classify_text(text):
    """Using the 3-class Stanford Named Entity Recognition model, classify each
       word in the input text as a PERSON, LOCATION, ORGANIZATION, or O (for
       other)."""

    directory = "C:/Users/liabbott/Documents/Projects/CBP OIT/stanford_ner/"
    mod = "classifiers/english.all.3class.distsim.crf.ser.gz"
    tag = "stanford-ner.jar"
    path_to_model = os.path.normpath(directory + mod)
    path_to_tagger = os.path.normpath(directory + tag)
    st = StanfordNERTagger(path_to_model, path_to_tagger, encoding='utf-8')

    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)

    return classified_text
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section = None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website.
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger  = StanfordNERTagger(model, jar)
    section = section

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(list(sectpull([fileid],section=section))[0][1])
        else:
            text  = corpus.words(fileid)

        chunk = []

        for token, tag in tagger.tag(text):
            if tag == 'O':
                if chunk:
                    # Flush the current chunk
                    etext =  " ".join([c[0] for c in chunk])
                    etag  = chunk[0][1]
                    chunk = []

                    # if etag == 'PERSON':
                    #     key = 'persons'
                    # elif etag == 'ORGANIZATION':
                    #     key = 'organizations'
                    # elif etag == 'LOCATION':
                    #     key = 'locations'
                    # else:
                    #     key = 'other'

                    if etag == 'LOCATION':
                        key = 'locations'
                    else:
                        key = 'other'
                    results[fileid][key].append(etext)

            else:
                # Build chunk from tags
                chunk.append((token, tag))

    return results
Example #44
def html_ner(content):
    st = StanfordNERTagger(
        './lib/classifiers/english.all.3class.distsim.crf.ser.gz',
        './lib/stanford-ner-3.5.2.jar')
    soup = BeautifulSoup(content, "html.parser")
    for script in soup(["script", "style", "sup"]):
        script.extract()
    tokenised_sents = list(soup.stripped_strings)
    tokenised_words = [wordpunct_tokenize(sent) for sent in tokenised_sents]
    tagged_sents = [st.tag(sent) for sent in tokenised_words]

    result = list()

    for sent in tagged_sents:
        for tag, chunk in groupby(sent, lambda x: x[1]):
            if tag != 'O':
                result.append((tag, ' '.join(w for w, t in chunk).encode('utf-8').strip()))
    return result
Example #45
    def sanitize_result(self, text):
        
        
        st = StanfordNERTagger('C:\Python27\stanford_ner\classifiers\english.all.3class.distsim.crf.ser.gz',
                                                   'C:\Python27\stanford_ner\stanford-ner.jar',
                                                   encoding='utf-8')
        tokenized_text = word_tokenize(self.capitalize_first_letter(text))
        classified_text = st.tag(tokenized_text)

        named_entities = self.get_continuous_chunks(classified_text)
        named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
        named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]


        for tag, chunk in groupby(named_entities_str_tag, lambda x:x[1]):
            if tag == "PERSON":
                #print "%-12s"%tag, " ".join(w for w, t in chunk)
                name = " ".join(w for w, t in chunk)
               
        return name
def main():

    # load the pretrained Stanford NER tagger

    st = StanfordNERTagger(
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/classifiers/english.conll.4class.distsim.crf.ser.gz",
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/stanford-ner.jar",
        encoding="utf-8",
    )

    fname = "/home/viswanath/data/resume/test_data/01.txt"
    fp = open(fname, "r")
    text = fp.read()
    #  print text
    lstemp = cleanse_data(text)
    list_ner_out = st.tag(lstemp.split())
    #   list_ner_out = st.tag(text.split())
    #   print list_ner_out
    # list_out = st.tag('Rami Eid is studying at Stony Brook University in NY'.split())

    fp = open("ner_temp.txt", "w")
    #    fp.write(list_ner_out)
    for item in list_ner_out:
        fp.write("{0}\n".format(item))
    fp.close()

    ne_tagged_sent = list_ner_out

    ne_tree = stanfordNE2tree(ne_tagged_sent)

    print ne_tree

    ne_in_sent = []
    for subtree in ne_tree:
        if type(subtree) == Tree:  # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    print ne_in_sent
Example #47

conn = psycopg2.connect(database="NewsSource", user="******", password="******", host="newdb.cnceaogjppz8.us-west-2.rds.amazonaws.com",port="5432")
cur = conn.cursor()
#cur.execute("select articletext,id from articlestable  order by crawleddatetime desc")

#cur.execute("select articletext,id from articlestable where order by crawleddatetime desc")
updateq = "update articlestable set wikilinks= %s where id=%s"
while True:
	cur.execute("select articletext,id from articlestable where COALESCE(wikilinks,'') = '' order by crawleddatetime desc limit 10")
	#cur.execute("select articletext,id from articlestable order by crawleddatetime desc limit 150")
	rows = cur.fetchall()
	for row in rows:
		print row[0]
		print 
		py = st.tag(row[0].split())	
		ne_tagged_sent = py

		named_entities = get_continuous_chunks(ne_tagged_sent)
		named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
		named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]

		ner = named_entities_str_tag
		people =[]
		places =[]
		organ =[]

		print named_entities_str_tag
		print
		for entity in ner:
Example #48
class PhotoStream(object):
	def __init__(self):
		self.store_list= []
		self.st = StanfordNERTagger(r'english.all.3class.nodistsim.crf.ser.gz',r'stanford-ner.jar')

	def request_pop_photo_stream(self, category, attempts):
		if attempts == 3:
			return None
		payload = {'consumer_key':CONSUMER_KEY,'rpp':100,'feature':'popular','only':category,'tags':1, 'sort':'rating','image_size':3}
		results = []
		for page in range(1,6):
			payload['page'] = page
			try:
				response = requests.get('https://api.500px.com/v1/photos', params=payload)
				result_page = response.json()
				results.extend(result_page['photos'])
			except:
				self.request_pop_photo_stream(category, attempts+1)
		return results

	def get_located_photos(self, photo_list, category):	
		for photo in photo_list:
			if photo_manager.photo_seen(photo['id']):
				self.store_list.append(photo)
			else:
				photo_manager.add_photo(photo['id'])
				if photo['latitude'] is not None:
					photo['exact location'] = True
					self.store_list.append(photo)
				elif category != 'People':
					self.extract_location(photo,category)	
		return self.store_list

	def extract_location(self, photo, category):
		tags = photo['tags']
		possible_locations = []
		tag_pos_array = []
		pos = 0
		total_tags = ''
		for tag in tags:
			tag = tag.title()
			if tag in location_manager.known_locations:
				possible_locations.append(tag)
			elif tag in location_manager.not_locations:
				continue
			elif tag not in EXCLUDED_LOCATIONS:
				separated_subtags = tag.split()
				#record the end position of a tag in total_tags
				pos = pos+len(separated_subtags)
				tag_pos_array.append(pos)
				total_tags = total_tags+tag+' '
		total_tags = total_tags[:-1]
		possible_locations.extend(self.nlp_analyze(total_tags, tag_pos_array))
		
		possible_locations = [loc for loc in possible_locations if loc not in EXCLUDED_LOCATIONS]
		if len(possible_locations) == 0:
			return None
		location_dic = {}
		for i in range(len(possible_locations)):
			if possible_locations[i] in location_dic:
				location_dic[possible_locations[i]] = location_dic[possible_locations[i]]+1
			else:
				location_dic[possible_locations[i]] = 1
		sorted_location = sorted(location_dic.items(),key=operator.itemgetter(1))
		sorted_location.reverse()
		# use the location that appears most frequently in the tags
		location = sorted_location[0]
		#print photo['name']
		#print location[0]
		#print "\n"
		lat, lng = self.request_latlng(location[0], category)
		if lat is not None:
			photo['latitude'] = lat
			photo['longitude'] = lng
			photo['exact location'] = False
			self.store_list.append(photo)

	def nlp_analyze(self, text, tag_pos_array):
		if text is None or text == "":
			return []
		possible_locations = []
		splitted_text = text.split()
		total_length = len(splitted_text)
		result = self.st.tag(splitted_text)
		start = 0
		for i in range(0,len(tag_pos_array)):
			end = tag_pos_array[i]
			conseq = False
			loc_tmp = ""
			for j in range(start,end):
				if result[j][1] == 'LOCATION':
					if conseq:
						loc_tmp = loc_tmp+' '+result[j][0]
					else:
						loc_tmp = result[j][0]
						conseq = True
				else:
					if loc_tmp != "":
						location_manager.known_locations.add(loc_tmp)
						possible_locations.append(loc_tmp)
					loc_tmp = ""
					conseq = False
					location_manager.not_locations.add(result[j][0])
			if loc_tmp != "":
				possible_locations.append(loc_tmp)
			start = end

		return possible_locations

	def request_latlng(self, location, category):
		# use three geonames.org accounts to avoid request rate limits
		geoname_account = GEONAME_ACCOUNTS[PHOTO_CATEGORIES.index(category)]
		payload = {'q':location,'maxRows':1,'username':geoname_account}
		try:
			response = requests.get('http://api.geonames.org/searchJSON', params=payload)
			result = response.json()
			return float(result['geonames'][0]['lat']), float(result['geonames'][0]['lng'])
		except:
			return None, None
		
	def save_photo_stream_to_db(self, photo_list, category):
		if category == 'City and Architecture':
			photo_collection = mydb.city
		elif category == 'Landscapes':
			photo_collection = mydb.landscape
		elif category == 'People':
			photo_collection = mydb.people
		for photo in photo_list:
			#check if the photo is already in the database
			if photo_collection.find_one({'id':photo['id']}) is None:
				if photo['latitude'] is not None:
					photo_collection.insert(photo)
			else:
				photo_collection.update_one(
					{'id': photo['id']},
					{"$set": {"rating": photo['rating']}}
				)
Example #49
	StanfordNERPath = './stanford-ner'
	st = StanfordNERTagger(StanfordNERPath + '/classifiers/english.all.3class.distsim.crf.ser.gz', StanfordNERPath + '/stanford-ner.jar')

	indicator = set([u'is', u'was', u'are', u'were'])
	noun_tag = set(['NN', 'NNS', 'NNP', 'NNPS'])

	q = open("withDoc.rq").read()
	results = G.query(q)
	outfile = open("../output/stadium.tsv", "w")
	count = 0
	for row in results:
		count += 1
		if count % 10 == 0:
			print "%d documents has been processed" % count
		text = row[1]
		tags = st.tag(word_tokenize(text))
		firstSentence = ''
		for j in range(1, len(text)):
			if (text[j] == '.') and ((j+1==len(text) or (text[j+1] in string.whitespace))):
				if text[j-1] not in string.uppercase:
					firstSentence = text[:j]
					break
		if firstSentence == '':
			firstSentence = text.split('.')[0]
		postags = pos_tag(word_tokenize(firstSentence))
		posLen = len(postags)
		noun = ''
		idx = 0
		for j in range(posLen):
			if postags[j][0] in indicator:
				idx = j
	def namedEntityRecognize(self, sentence):
		# perform NER on the sentence - returns a list of (word, NER tag) tuples
		st = StanfordNERTagger(self.modelPath)
		tagged = st.tag(sentence.split())
		print tagged
		return tagged
Example #51
def get_entities(content):
    st = StanfordNERTagger(r'C:\Users\Philippe\Downloads\stanford-ner-2015-04-20\stanford-ner-2015-04-20\classifiers\english.all.3class.distsim.crf.ser.gz')
    entity_list = st.tag(content.split())

    return entity_list
Example #52
#from code_classifier_chunker import ConsecutiveNPChunker
jar_folder = "/Users/nishantagarwal/stanford-ner-2015-04-20/stanford-ner.jar"
os.environ['CLASSPATH']=jar_folder

from nltk.tag import StanfordNERTagger
path_to_jar = "/Users/nishantagarwal/stanford-ner-2015-04-20/stanford-ner.jar"
path_to_model = "/Users/nishantagarwal/stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz"#english.muc.7class.distsim.crf.ser.gz"
#train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
#test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
#chunker = ConsecutiveNPChunker(train_sents)
l = 'When did the boys realize they were lost?'
#print chunker.parse(l)
grammar = "NP: {<LOCATION><O>?<LOCATION>+}"
qetag = StanfordNERTagger(path_to_model,path_to_jar)
l = nltk.word_tokenize(l)
ne_tagged = qetag.tag(l)
tagged =  nltk.pos_tag(l)
print l
print tagged
print ne_tagged
cp = nltk.RegexpParser(grammar)
tre = cp.parse(ne_tagged)
for subtree in tre.subtrees():
	if subtree.label() == 'NP':
		break
#print subtree.leaves()
#print(chunker.evaluate(test_sents))
#subtree = subtree.leaves()
answer = ' '.join(word for word,pos in subtree.leaves())
print answer
#qetag = StanfordNERTagger(path_to_model,path_to_jar)
Example #53
A big benefit of the Stanford NER tagger is that it provides us with a few different models for pulling out named entities. We can use any of the following:

3 class model for recognizing locations, persons, and organizations
4 class model for recognizing locations, persons, organizations, and miscellaneous entities
7 class model for recognizing locations, persons, organizations, times, money, percents, and dates


################################################################################################

The parameters passed to the StanfordNERTagger class include:

Classification model path (3 class model used below)
Stanford tagger jar file path
Training data encoding (default of ASCII)

"""

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

st = StanfordNERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
					   '/usr/share/stanford-ner/stanford-ner.jar',
					   encoding='utf-8')

text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)

print(classified_text)
Example #54
from nltk.tag import StanfordNERTagger
from nltk import word_tokenize
import string
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

outfile = open("../output/entity.txt", "w")
doc_count = 826
entitySet = {}
print "Stage 1: NER starts!"
for i in range(doc_count):
	infile = open("../data/%d.txt" % i)
	tags = st.tag(word_tokenize(unicode(infile.read())))
	infile.close()
	transform_file = open("../data/transformed_%d.txt" % i, "w")
	entity = u''
	label = u''
	transform_text = u''
	if i % 10 == 0:
		print "%d files have been processed." % i 
	for tag in tags:
		if tag[1] == u'O':
			if label != u'':
				if not entitySet.has_key((entity, label)):
					entitySet[(entity, label)] = len(entitySet)
					outfile.write('%s%d\t%s\n' % (label, entitySet[(entity, label)], entity))
				transform_text += ' %s%d' % (label, entitySet[(entity, label)])
			transform_text += ' ' + tag[0]
Example #55
from nltk.tag import StanfordNERTagger

st = StanfordNERTagger("english.all.3class.distsim.crf.ser.gz")
print st.tag(
    "Rami Eid is studying at Stony Brook University in NY. And he wants to work at CERN in Switzerland in Europe .".split()
)
Example #56
class StanfordTagger(object):
    """
    Wrapper for the Stanford NER Tagger
    """
    __currentDirectory = os.path.dirname(os.path.realpath(__file__)) # Current directory
    __classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz"
    __stanfordJar = "%s/dist/stanford-ner.jar"

    def __init__(self, language="en"):
        from nltk.tag import StanfordNERTagger

        self.__stanfordJar = "%s/dist/stanford-ner.jar" % self.__currentDirectory
        self.__classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz" % (self.__currentDirectory,)
        self.__tagger = StanfordNERTagger( self.__classifier,
                                           self.__stanfordJar,
                                           encoding="utf-8")
        self.__namedEntitiesFinder = NERFinder(language=language)

    def __tags(self, raw_text):
        """
        Return the named entities tokens given a raw text
        :raw_text: Raw text
        """
        from nltk.tokenize import word_tokenize

        if isinstance(raw_text, str):
            # Decode to utf-8
            raw_text = raw_text.decode('utf-8')
        # Tokenize the string
        token_text = word_tokenize(raw_text)
        # Retrieve the named entities from the tokens
        ne_tags = self.__tagger.tag(token_text)
        return(ne_tags)

    def __bio_tagger(self, ne_tagged):
        """
        Return BIO tags from named entities
        :ne_tagged: name_entities tokens
        """
        bio_tagged = []
        prev_tag = "O"
        for token, tag in ne_tagged:
            if tag == "O": #O
                bio_tagged.append((token, tag))
                prev_tag = tag
                continue
            if tag != "O" and prev_tag == "O": # Begin NE
                bio_tagged.append((token, "B-"+tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag == tag: # Inside NE
                bio_tagged.append((token, "I-"+tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
                bio_tagged.append((token, "B-"+tag))
                prev_tag = tag
        return bio_tagged

    def __generate_tree(self, bio_tagged):
        """
        Transform a list of tags into a tree
        """
        from nltk import pos_tag
        from nltk.chunk import conlltags2tree


        tokens, ne_tags = zip(*bio_tagged)
        pos_tags = [pos for token, pos in pos_tag(tokens)]

        conlltags = [(token, pos, ne) for token, pos, ne in zip(tokens, pos_tags, ne_tags)]
        ne_tree = conlltags2tree(conlltags)
        return ne_tree

    def __getEntities(self, taggedWords):
        """
        It returns the entities from a list of tagged words (NER or POS) after generating the syntax tree
        """
        bio_tagged = self.__bio_tagger(taggedWords)
        stanford_tree = self.__generate_tree(bio_tagged=bio_tagged)

        entities = self.__namedEntitiesFinder.getEntities(stanford_tree)
        return entities

    def getEntitiesByTags(self, pos_tagged_words):
        """
        Get entities from a list of word tagged with POS Tags.
        """
        entities = self.__getEntities(taggedWords=pos_tagged_words)
        return entities

    def getEntities(self, raw_text):
        """
        Get the entities from a raw text
        """
        ne_entities = self.__tags(raw_text=raw_text)
        entities = self.__getEntities(taggedWords=ne_entities)
        return entities
class PhotoStream(object):
    def __init__(self):
        self.client = api.FiveHundredPx(CONSUMER_KEY, CONSUMER_SECRET)
        self.store_list = []
        self.st = StanfordNERTagger(r"english.all.3class.nodistsim.crf.ser.gz", r"stanford-ner.jar")

    def request_pop_photo_stream(self, category, attempts):
        if attempts == 3:
            return None
        results = None
        try:
            results = self.client.get_photos(rpp=2, feature="popular", only=category, sort="rating", tags=1)
        except:
            self.request_pop_photo_stream(category, attempts + 1)
        return results

    def get_located_photos(self, photo_list, category):
        for photo in photo_list:
            if photo_manager.photo_seen(photo["id"]):
                self.store_list.append(photo)
            else:
                photo_manager.add_photo(photo["id"])
                if photo["latitude"] is not None:
                    photo["exact location"] = True
                    self.store_list.append(photo)
                elif category != "People":
                    self.extract_location(photo, category)
        return self.store_list

    def extract_location(self, photo, category):
        tags = photo["tags"]
        possible_locations = []
        q = Queue()
        job_list = []
        for tag in tags:
            tag = tag.title()
            if tag in location_manager.known_locations:
                possible_locations.append(tag)
            elif tag in location_manager.not_locations:
                continue
            elif tag not in EXCLUDED_LOCATIONS:
                p = Process(target=self.nlp_analyze, args=(tag, q))
                job_list.append(p)
                p.start()
        for job in job_list:
            job.join()
        while not q.empty():
            possible_locations.extend(q.get())

        if len(possible_locations) == 0:
            return None
        # count how often each candidate location appears among the tags
        location_dic = {}
        for loc in possible_locations:
            location_dic[loc] = location_dic.get(loc, 0) + 1
        sorted_location = sorted(location_dic.items(), key=operator.itemgetter(1))
        sorted_location.reverse()
        # use the location that appears most frequently among the tags
        location = sorted_location[0]
        print photo["name"]
        print location[0]
        print "\n"
        lat, lng = self.request_latlng(location[0], category)
        if lat is not None:
            photo["latitude"] = lat
            photo["longitude"] = lng
            photo["exact location"] = False
            self.store_list.append(photo)

    def nlp_analyze(self, text, q):
        if text is None or text == "":
            return []
        possible_locations = []
        result = self.st.tag(text.split())
        loc_tmp = ""
        conseq = False
        for r in result:
            if r[1] == "LOCATION":
                if conseq:
                    loc_tmp = loc_tmp + " " + r[0]
                else:
                    loc_tmp = r[0]
                    conseq = True
            else:
                if loc_tmp != "":
                    location_manager.known_locations.add(loc_tmp)
                    possible_locations.append(loc_tmp)
                loc_tmp = ""
                conseq = False
                location_manager.not_locations.add(r[0])
        if loc_tmp != "":
            # flush a location that runs to the end of the tag and remember it as known
            location_manager.known_locations.add(loc_tmp)
            possible_locations.append(loc_tmp)
        q.put(possible_locations)
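    # Illustrative sketch (not in the original code): for a tag such as "San Francisco Bay",
    # st.tag() typically returns three consecutive LOCATION tokens, and the loop above joins
    # them into the single candidate string "San Francisco Bay" before putting it on the queue.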

    def request_latlng(self, location, category):
        # use three geonames.org accounts (one per photo category) to stay under per-account request limits
        geoname_account = GEONAME_ACCOUNTS[PHOTO_CATEGORIES.index(category)]
        payload = {"q": location, "maxRows": 1, "username": geoname_account}
        try:
            response = requests.get("http://api.geonames.org/searchJSON", params=payload)
            result = response.json()
            return float(result["geonames"][0]["lat"]), float(result["geonames"][0]["lng"])
        except Exception:
            # network error, malformed JSON, or no geonames hit for this query
            return None, None
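    # Response shape assumed by the code above: a successful geonames searchJSON call returns
    # JSON along the lines of
    #   {"totalResultsCount": 1, "geonames": [{"lat": "37.77", "lng": "-122.41", ...}]}
    # so the first hit's coordinates are parsed into floats.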

    def save_photo_stream_to_db(self, photo_list, category):
        if category == "City and Architecture":
            photo_collection = mydb.city
        elif category == "Landscapes":
            photo_collection = mydb.landscape
        elif category == "People":
            photo_collection = mydb.people
        else:
            # unknown category: nothing to persist, and photo_collection would be undefined below
            return
        for photo in photo_list:
            # check if the photo is already in the database
            if photo_collection.find_one({"id": photo["id"]}) is None:
                if photo["latitude"] is not None:
                    photo_collection.insert(photo)
            else:
                photo_collection.update_one(
                    {"id": photo["id"]}, {"$set": {"rating": photo["rating"]}, "$currentDate": {"lastModified": True}}
                )
Example #58
import ner
from nltk.tag import StanfordNERTagger

stanford_ner_dir = '/home/will/packages/stanfordNER/'
eng_model_filename = stanford_ner_dir + 'classifiers/english.all.3class.distsim.crf.ser.gz'
my_path_to_jar = stanford_ner_dir + 'stanford-ner.jar'

st = StanfordNERTagger(model_filename=eng_model_filename, path_to_jar=my_path_to_jar)
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())
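# Expected output (approximate; exact labels depend on the classifier/model version):
# [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'),
#  ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'),
#  ('in', 'O'), ('NY', 'LOCATION')]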

# tagger = ner.HttpNER(host='localhost', port=8080)
# tagger.get_entities("University of California is located in California, United States")
Example #59
def main(argv):
    # Opening file to read and file to write, also preparing StanfordNERTagger.
    st = StanfordNERTagger('/home/sietse/Desktop/Project Tekstanalyse/Project/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz')
    f = open("{0}/en.tok.off.pos".format(argv[1])).readlines()
    g = open("en.tok.off.pos.ent.{0}.{1}".format(argv[1][-9:-6],argv[1][-5:]), "w")
    context = []
    lines = []
    text = []
    sets = []
    bigrams = []
    lines_updated = []
    NER_tags = ["COU", "CIT", "NAT", "PER", "ORG", "ANI", "SPO", "ENT"]
    # Creating context for lesk function and NER- and POS-tagging words.
    print("[Tagging words...]")
    for line in f:
        line = line.split()
        lines.append(line)
        context.append(line[3])
        text.append((line[3],line[4]))  
    chunk = nltk.ne_chunk(text)
    stanford = st.tag(context)
    # Prepare each line for the output file (the NER tag is appended where applicable).
    print("[Applying tags to lines...]")  
    for line in f:
        lemmas = []
        names = []
        line = line.split()
        for i in stanford:
            if line[4] in ("NNP", "NNPS", "NNS"):  # proper nouns and plural nouns
                if line[3] in i:
                    if i[1] == "PERSON":
                        line.append("PER")
                    if i[1] == "ORGANIZATION":
                        line.append("ORG")
                    if i[1] == "LOCATION":
                        line.append("LOCATION")
        if len(wordnet.synsets(line[3], 'n')) == 0:
            hyper = []
        if len(wordnet.synsets(line[3], 'n')) == 1:
            synset = wordnet.synsets(line[3], 'n')[0]
            hyper = synset.hypernym_paths()
        if len(wordnet.synsets(line[3], 'n')) > 1:
            synset = lesk(context, line[3], 'n')
            # lesk can return None when no sense fits the context
            hyper = synset.hypernym_paths() if synset is not None else []
        for i in hyper:
            for e in i:
                lemmas.append(e.lemmas())
        for i in lemmas:
            for e in i:
                names.append(e.name())
        if "country" in names and line[3][0].isupper():
            line.append("COU")
        elif "government" in names and line[3][0].isupper():
            line.append("COU")
        elif "province" in names and line[3][0].isupper():
            line.append("COU")
        elif "state" in names and line[3][0].isupper():
            line.append("COU")   
        elif "city" in names and line[3][0].isupper():
            line.append("CIT")
        elif "sport" in names:
            line.append("SPO")
        elif "animal" in names:
            line.append("ANI")
        elif "entertainment" in names and line[3][0].isupper():
            line.append("ENT") 
        elif "amusement" in names and line[3][0].isupper():
            line.append("ENT")
        elif "island" in names:
            line.append("NAT") 
        elif "water" in names:
            line.append("NAT")
        elif "mountain" in names:
            line.append("NAT")  
        if "LOCATION" in line and not "COU" in line:
            line.append("CIT")
            line.remove("LOCATION")
            if "LOCATION" in line:
                line.remove("LOCATION")
        if "LOCATION" in line and "COU" in line:
            line.remove("LOCATION")
            if "LOCATION" in line:
                line.remove("LOCATION")
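        # Illustrative example (assumption, not from the original code): for a capitalised token
        # such as "Paris", the WordNet hypernym lemmas typically include "city", so the line is
        # tagged "CIT"; a Stanford "LOCATION" tag without an accompanying "COU" is likewise
        # rewritten to "CIT" by the block above.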
        lines_updated.append(line)
    # Bigrams of adjacent tagged tokens are collected so that the Wikipedia URLs can be more precise (see the example below).
    for i,j in zip(lines_updated,lines_updated[1:]): 
        if len(i) > 5 and len(j) > 5:
            if i[5] in NER_tags and j[5] in NER_tags:          
                bigrams.append(i[3]+" "+j[3])
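    # Illustrative example (not from the original code): if adjacent lines carry the tokens
    # "New" and "York" and both received an NER tag, the bigram "New York" is stored so the
    # Wikipedia lookup below can prefer the two-word title over the single token "New".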
    # Words are assigned Wikipedia pages and then the whole is written to the output file.
    print("[Attaching Wikipedia urls and writing output file...]")
    for line in lines_updated:
        try:
            if len(line) > 5:
                if bigrams:
                    for i in bigrams:
                        if line[3] in i:
                            page = wikipedia.page(i)
                            break
                        else:
                            page = wikipedia.page(line[3])
                else: 
                    page = wikipedia.page(line[3])
                if len(line) >= 6:
                    g.write("{0} {1} {2} {3} {4} {5} {6}\n".format(line[0], line[1], line[2], line[3], line[4], line[5], page.url))
                else:
                    g.write("{0} {1} {2} {3} {4}\n".format(line[0], line[1], line[2], line[3], line[4]))
            else:
                g.write("{0} {1} {2} {3} {4}\n".format(line[0], line[1], line[2], line[3], line[4]))
        # If the wikipedia module does not know what url to attach, a custom url will be added which (probably) leads to a disambiguation page.
        except wikipedia.exceptions.DisambiguationError:
            # fall back to a hand-built URL; default to the single token so page is
            # defined even when the bigram list is empty
            page = "https://en.wikipedia.org/wiki/{0}".format(line[3])
            for i in bigrams:
                if line[3] in i:
                    page = "https://en.wikipedia.org/wiki/{0}".format(i.replace(" ", "_"))
                    break
            if len(line) >= 6:
                g.write("{0} {1} {2} {3} {4} {5} {6}\n".format(line[0], line[1], line[2], line[3], line[4], line[5], page))
            else:
                g.write("{0} {1} {2} {3} {4}\n".format(line[0], line[1], line[2], line[3], line[4]))
            continue
        # If another error occurs, the line will be written to the file with or without NER-tag and without url.
        except wikipedia.exceptions.PageError:
            if len(line) >= 6:  # the line carries an NER tag in field 5
                g.write("{0} {1} {2} {3} {4} {5}\n".format(line[0], line[1], line[2], line[3], line[4], line[5]))
            else:
                g.write("{0} {1} {2} {3} {4}\n".format(line[0], line[1], line[2], line[3], line[4]))
            continue
    print("[Processed file...]")
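
# Minimal invocation sketch (assumption, not part of the original example): the script
# expects the directory that contains the "en.tok.off.pos" file as its first argument.
if __name__ == "__main__":
    import sys
    main(sys.argv)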
Example #60
class getWikiInfo:
	# retrieves the family from wiki text and infoboxes

	def __init__(self, person):
		
		self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
		classifier = "ner/classifiers/" + "english.all.3class.distsim.crf.ser.gz"
		jar = "ner/stanford-ner-3.4.jar"
		self.tagger = StanfordNERTagger(classifier, jar)
		self.ap = []
		self.person = person
		self.query = Sparql(person)
		self.setSpouse()
		self.setMother()
		self.setFather()
		self.setFullName()
		self.setAbstract()
		self.setAbstractInfo()

	def setAbstractInfo(self):
		try:
			conObj = wikipedia.page(self.person)
			content = conObj.content
		except wikipedia.exceptions.DisambiguationError as e:
			content = None
		except wikipedia.exceptions.PageError as e:
			content = None
		
		

		if content:
			for sentence in self.tokenizer.tokenize(content):
				if 'daughter of' in sentence:
					sentence = sentence[sentence.find('daughter of'):]
				elif 'son of' in sentence:
					sentence = sentence[sentence.find('son of'):]
				elif 'child of' in sentence:
					sentence = sentence[sentence.find('child of'):]
				else:
					sentence = False


				# daughter of, son of, child of
				if sentence is not False:
					person = ''
					for tag in self.tagger.tag(sentence.split()):
						if tag[1] == 'PERSON':
							person = person + " " + tag[0]
						else:
							if not person == '':
								self.ap.append(person)
							person = ''
					# flush a name that runs to the end of the sentence
					if not person == '':
						self.ap.append(person)
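				# Illustrative sketch (not in the original code): for a fragment such as
				# "son of John Smith and Mary Jones", consecutive PERSON tokens are joined,
				# so self.ap would collect "John Smith" and "Mary Jones".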

		

	def getAbstractParents(self):
		if len(self.ap) > 0:
			return ", ".join(set(self.ap))
		else:
			return "Unknown"


	def setSpouse(self):
		if 'spouse' in self.query.result:
			self.spouse = list(self.query.result['spouse'])
		else:
			self.spouse = ['Unknown', 'literal']	
	
	def setMother(self):
		if 'mother' in self.query.result:
			self.mother = list(self.query.result['mother'])
		else:
			self.mother = ['Unknown', 'literal']
	
	def setFather(self):
		if 'father' in self.query.result:
			self.father = list(self.query.result['father'])
		else:
			self.father = ['Unknown', 'literal']

	def setFullName(self):
		if 'fullName' in self.query.result:
			self.fullName = list(self.query.result['fullName'])
		else:
			self.fullName = [self.person, 'literal']	

	def setAbstract(self):
		if 'abstract' in self.query.result:
			self.abstract = list(self.query.result['abstract'])
		else:
			self.abstract = ['Unknown', 'literal']	
		
	def getSpouse(self):
		return self.spouse
	
	def getMother(self):
		return self.mother

	def getFather(self):
		return self.father

	def getFullName(self):
		return self.fullName

	def getAbstract(self):
		return self.abstract