def SampleResource():
    epoch = time.time()
    args = request.args.get('query')
    try:
        wiki = wikiapi.WikiApi()
        results = wiki.find(args)
        article = wiki.get_article(results[0])
        k = []
        tagger = ner.SocketNER(host='localhost', port=17017)
        sent_tokenizer = nltk.tokenize.PunktSentenceTokenizer()
        sentences = sent_tokenizer.tokenize(article.content.encode("utf-8"))
        print sentences
        data = return_data(article, sentences)
        print time.time() - epoch
        for sentence in sentences:
            j = tagger.get_entities(sentence)
            if j.get("LOCATION") is not None:
                k = k + j.get("LOCATION")
            if j.get("ORGANIZATION") is not None:
                k = k + j.get("ORGANIZATION")
            if j.get("PERSON") is not None:
                k = k + j.get("PERSON")
        return jsonify({
            "data": data[0],
            "data_head": {"data_tag": data[1], "image_src": article.image},
            "tags": list(set(k)),
            "error": False,
            "success": True,
        })
    except wikipedia.DisambiguationError as e:
        return jsonify({"data": e.__str__(), "error": True})
    except IndexError:
        return jsonify({
            "data": None,
            "error": True,
            "messege": "We dont have anything yet, related to %s" % args
        })
def test(self, corpus, port=9191):
    self.tagger = ner.SocketNER("localhost", port, output_format='inlineXML')
    tagged_sentences = []
    logging.info("sending sentences to tagger {}...".format(self.path))
    for isent, sid in enumerate(self.sids):
        print("Aditi " + str(sid))
        #out = self.tagger.tag_text(replace_abbreviations(" ".join([t.text for t in self.tokens[isent]])))
        #out = self.tagger.tag_text(self.sentences[isent])
        #text = self.sentences[isent]
        text = " ".join([t.text for t in self.tokens[isent]])
        #logging.info("tagging: {}/{} - {}={}".format(isent, len(self.sids), sid, did))
        try:
            out = self.tagger.tag_text(text)
            print(text)
        except SocketError as e:
            if e.errno != errno.ECONNRESET:
                raise  # Not error we are looking for
            print("socket error with sentence {}".format(text))
        except:
            print("other socket error!")
            out = self.tagger.tag_text(text)
        #print text, out
        #out = text
        tagged_sentences.append(out)
    print(tagged_sentences)
    results = self.process_results(tagged_sentences, corpus)
    return results
def get_ner_tagged_text(df):
    tagger = ner.SocketNER(host='localhost', port=8080)
    txt_tagged = []
    i = 1
    time_start = datetime.now()
    for txt in df.header.map(str) + df.body.map(str):
        try:
            txt_tagged.append(tagger.get_entities(txt))
            if i % 100 == 0:
                print 'Working on item:', i, '\tof', len(df)
                df_temp = df.ix[range(i)]
                df_temp['ner'] = txt_tagged
                df_temp = get_sentiment_score(df_temp)
                triplet = get_entity_sentment_triplet(df_temp)
                write_triplet_to_file(triplet)
                print 'Time remaining:', (datetime.now() - time_start) / i * (len(df) - i)
        except:
            txt_tagged.append('Error while tagging')
        i = i + 1
        #print 'Tagged:', tagger.get_entities(txt)
    df['ner'] = txt_tagged
    print 'NER Finished'
    #df_ner= pandas.io.json.read_json(txt_tagged)
    return df
def produceNamedEntity(list_sent):
    """
    input: list of sentences in the file
    return: list of all the named entities in the file
    """
    # lmtzr = WordNetLemmatizer()
    pyner_tagger = ner.SocketNER(host='localhost', port=8080)
    list_ne = []
    for sent in list_sent:
        if len(sent.strip()) != 0:
            sent_ne_dict = pyner_tagger.get_entities(sent)
            sent_ne_dict_len = len(sent_ne_dict.items())
            if sent_ne_dict_len != 0:
                for entity, list_word in sent_ne_dict.items():
                    # print entity, list_word
                    for word in list_word:
                        if len(word) > 2:
                            list_ne.append(
                                str(word.lower()) + ':' + str(entity))
    return list_ne
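# Usage sketch for produceNamedEntity (an assumption, not part of the original
# project): it expects a Stanford NER server already listening on localhost:8080;
# the input sentences below are invented for illustration.
example_sentences = [
    "Barack Obama was born in Hawaii.",
    "Google is headquartered in Mountain View.",
]
# Each returned entry is formatted as "word:TYPE", e.g. "hawaii:LOCATION"
print(produceNamedEntity(example_sentences))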
def locations_tag(directory):
    """
    Finds location terms in all text files in a given directory
    Input: directory - string representing the local directory to analyze
    Output: locations - dictionary mapping each file containing location terms to the terms
    """
    locations = {}
    tagger = ner.SocketNER(host='localhost', port=8080)
    subs = {
        '\n': '. ',
        'co.': 'County',
        'Co.': 'County',
        'county': 'County',
        'A.T.': 'Arkansas',
        'M.T.': 'Mississippi'
    }
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r') as f:
                text = f.read().decode("utf8")
                text = preprocess(text, subs)
                entities = tagger.get_entities(text)
                if 'LOCATION' in entities:
                    locs = merge_locations(entities['LOCATION'], text)
                    locations[filename] = locs
    return locations
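# Usage sketch for locations_tag: the directory path is hypothetical, and the
# call assumes the NER server on localhost:8080 plus the preprocess and
# merge_locations helpers defined elsewhere in that project.
locations_by_file = locations_tag("data/letters")
for fname, locs in locations_by_file.items():
    print(fname, locs)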
def map_NER_per_page(page_text, NER_port=2020):
    '''
    Uses Stanford's NER to create a map of entities contained in the text.
    arguments:
        page_text: (string) the page's full text
        NER_port: (int) port the NER server runs on. defaults to 2020
    '''
    NER_socket = ner.SocketNER(host="localhost", port=NER_port)
    try:
        result = NER_socket.get_entities(page_text)
        entities = {}
        for entity_type in [str(key) for key in result.keys()]:
            entities[entity_type] = list(
                set([str(entity) for entity in result[entity_type]]))
        return entities
    except Exception as e:
        print "map_NER_per_page ERROR"
        print e, type(e)
        return None
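# Illustrative call to map_NER_per_page, assuming an NER server on
# localhost:2020; the sample sentence and the commented result are made up.
page = "Angela Merkel met Emmanuel Macron in Berlin."
entity_map = map_NER_per_page(page)
# e.g. {'PERSON': ['Angela Merkel', 'Emmanuel Macron'], 'LOCATION': ['Berlin']}
print(entity_map)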
def extract_entities(text):
    print text
    tagger = ner.SocketNER(host='localhost', port=8080)
    tags = tagger.get_entities(text)
    if 'PERSON' in tags.keys():
        print tags['PERSON']
        print
        return set(tags['PERSON'])
    print
    return set([])
def __init__(self, example_path, test_file_path_list, enable_saving=False,
             n_gram=5, **kwargs):
    super().__init__(example_path, test_file_path_list, enable_saving, n_gram,
                     **kwargs)
    self.tagger = ner.SocketNER(host='localhost', port=8081)
    self.tagged_dict = None
def __init__(self, cfg=''):
    min_len, max_len, ne_types, _, _ = (cfg + '::::').split(':', 4)
    self.max_length = int(max_len) if max_len else -1
    self.min_length = int(min_len) if min_len else -1
    self.ban_punct_only = (min_len >= 0)
    if ne_types:
        self.ne_types = re.compile('^([' + ne_types + '].+)$')
        self.ner = ner.SocketNER(host='localhost', port=8080)
    else:
        self.ne_types = None
def __init__(self):
    try:
        self.tagger = ner.SocketNER(host='localhost', port=9191)
        self.testServer()
    except ConnectionRefusedError:
        print("WARNING: connection to NER local server refused!")
    except:
        print("WARNING: the local NER doesn't work properly!")
def __init__(self):
    ner_configs = [
        ConfigObj(
            os.path.join(os.path.dirname(__file__), 'Stanford_NER', config_file))
        for config_file in NER_CONFIG_FILES
    ]
    self.ners = [
        ner.SocketNER(host=ner_config['NER_HOST'], port=int(ner_config['NER_PORT']))
        for ner_config in ner_configs
    ]
def queryForEntity(expectedEntity, passage):
    tagger = ner.SocketNER(host='localhost', port=8081)  # requires server to be started
    answer = tagger.get_entities(passage)
    answers = []
    for j, currentExpectedEntity in enumerate(expectedEntity):
        for key in answer:
            if (key == currentExpectedEntity):
                for eachAnswer in answer[key]:
                    answerString = eachAnswer.encode()
                    answers.append(answerString)
    return answers
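# Example query with queryForEntity (a sketch: assumes the NER server is running
# on port 8081; the passage and requested entity types are invented).
passage = "Tim Cook announced the results at Apple headquarters in Cupertino."
people_and_places = queryForEntity(["PERSON", "LOCATION"], passage)
print(people_and_places)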
def annotate_sentence(self, text):
    self.tagger = ner.SocketNER("localhost", self.port, output_format='inlineXML')
    try:
        out = self.tagger.tag_text(text)
    except SocketError as e:
        if e.errno != errno.ECONNRESET:
            raise  # Not error we are looking for
        print "socket error with sentence {}".format(text)
    except:
        print "other socket error!"
        out = self.tagger.tag_text(text)
    return out
def __init__(self): print "init NLP toolkit" self.tagger = ner.SocketNER(host='localhost', port=1234) # parse list of stopwords self.stoplist = [i.strip() for i in open(stopwords_file)] self.stoplist += weibo_stopwords # better support for traditional character jieba.set_dictionary(dico_file)
def __init__(self):
    # RSS feed of EventsDoha Website
    self.events_doha_link = 'http://www.eventsdoha.com/feed/'
    self.event_lst = []
    #self.event_dict['type'] = 'FeatureCollection'
    #self.event_dict['features'] = []
    self.db = 'webevents'
    self.db_connection = self.init_mongo()
    self.event_collection = self.db_connection[self.db]['events']
    self.eventinfor_collection = self.db_connection[self.db]['eventinfor']
    # creating object for the class
    self.geo = Geocode()
    # accessing the PYner server from QCRI
    self.tagger = ner.SocketNER(host='10.2.0.30', port=9190)
def SampleResource():
    epoch = time.time()
    args = request.args.get('query')
    try:
        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
        )]
        data = browser.open(args)
        tagger = ner.SocketNER(host='localhost', port=17017)
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = sent_tokenizer.tokenize(
            nltk.clean_html(data.read().encode("utf-8")))
        data = return_data(sentences)
        print time.time() - epoch
        k = []
        for sentence in sentences:
            j = tagger.get_entities(sentence)
            if j.get("LOCATION") is not None:
                k = k + j.get("LOCATION")
            if j.get("ORGANIZATION") is not None:
                k = k + j.get("ORGANIZATION")
            if j.get("PERSON") is not None:
                k = k + j.get("PERSON")
        return jsonify({
            "data": data[0],
            "data_head": {
                "data_tag": data[1],
                "image_src": None
            },
            "tags": list(set(k)),
            "error": False,
            "success": True,
        })
    except wikipedia.DisambiguationError as e:
        return jsonify({"data": e.__str__(), "error": True})
    except IndexError:
        return jsonify({
            "data": None,
            "error": True,
            "messege": "We dont have anything yet, related to %s" % args
        })
def __init__(self, document):
    super(Document, self).__init__()
    self.content = ""
    self.title = ""
    self.source = ""
    self.published = ""
    self.bag_of_words = BagOfWords()
    self.ner_tagger = ner.SocketNER(host='localhost', port=1239,
                                    output_format="slashTags")
    if Document.vocabulary is None:
        Document.vocabulary = BagOfWords()
    self.read_document(document)
    Document.number_of_documents += 1
    self.list_rep = []
def getEntities(texts):
    if type(texts) != type([]):
        texts = [texts]
    """
    Run the Stanford NER in server mode using the following command:

    java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer \
        -loadClassifier classifiers/english.muc.7class.distsim.crf.ser.gz \
        -port 8000 -outputFormat inlineXML
    """
    tagger = ner.SocketNER(host='localhost', port=8000)
    entities = []
    for t in texts:
        sentence_entities = tagger.get_entities(t)
        entities.append(sentence_entities)
    return entities
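# Usage sketch for getEntities, assuming the NERServer command from the
# docstring above is already running on port 8000; the input strings are
# illustrative examples only.
single = getEntities("Barack Obama visited Berlin in 2013.")
batch = getEntities([
    "Google was founded by Larry Page and Sergey Brin.",
    "The Eiffel Tower is in Paris.",
])
# Each element is a dict keyed by entity type, e.g. {'PERSON': [...], 'LOCATION': [...]}
print(single)
print(batch)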
def ner_main(total):
    # corpus
    reader = moretags.read_gmb(moretags.corpus_root, 1000)
    data = list(reader)
    training_samples = data[:int(len(data) * 0.9)]
    test_samples = data[int(len(data) * 0.9):]
    chunker = moretags.NamedEntityChunker(training_samples[:5000])
    print "#training samples = %s" % len(training_samples)  # training samples = 55809
    print "#test samples = %s" % len(test_samples)  # test samples = 6201

    # scrape data from wikipedia
    uri = '/wiki/Kevin_Bacon'
    links, info, text = get_info(uri)
    names = set()
    tagger = ner.SocketNER(host='localhost', port=4295, output_format='slashTags')
    result = []
    while len(links) > 0 and len(names) < total:
        uri = links[random.randint(0, len(links) - 1)].attrs['href']
        name = uri[6:]
        if name not in names:
            names.add(name)
            print(name)
            print('#{}'.format(len(name)))
            try:
                links, info, text = get_info(uri)
                save_in_mongo(info, is_update=False)
            except Exception as err:
                print(err)
                continue
            # print('Name of this page is {0}\nInformation Card\n{1}\nUri:{2}'
            #       .format(name, info, uri))
            try:
                text = clean_data(text)
                """
                result = ner_analyse(text, chunker)
                rels = extract_rels(result)
                print ('Relations are \n\n\n{0}\n\n\n'.format(rels))
                """
                entities = ner_analyse_crfs(tagger, text)
                # print ('crf analysis result is \n{0}'.format(entities))
                # result.draw()
                # result = nltk.tree2conlltags(result)
                # print ('Relation entities are like \n{0}'.format(result))
                save_in_mysql(name, entities)
            except Exception as err:
                print('Named Entity Analysis Error:\n{0}'.format(err))
def start():
    '''
    Two ways to run this script:
    1. Run it inside an IDE
    2. Run it from the command line
    '''
    if len(sys.argv) < 2:
        '''Run from the IDE'''
        file_dir_path = r'C:\Users\bnuzgn\Desktop\Xin'
        save_dir_path = r'C:\Users\bnuzgn\Desktop\Xin2'
    elif len(sys.argv) == 3:
        '''Run from the command line'''
        file_dir_path = sys.argv[1]
        save_dir_path = sys.argv[2]
    else:
        '''Invalid arguments'''
        print('Argument error: argument 1 is the input path, argument 2 is the save path.')
        return

    '''Check for and create the save directory'''
    new_dir_path = createDir(save_dir_path)
    if not os.path.isdir(file_dir_path):
        print(file_dir_path + ' does not exist')
        return

    '''Start the pyner socket client'''
    tagger = ner.SocketNER(host='localhost', port=9191)

    '''Read and load name.list'''
    nameDicPath = r'C:\Users\bnuzgn\Desktop\name.txt'
    with open(nameDicPath, 'r', encoding='utf-8') as f:
        nameLines = f.readlines()
    nameDicList = []
    for line in nameLines:
        nameDicList.append(line.split()[1])

    '''Process every file under file_dir_path'''
    for dirpath, dirnames, filenames in os.walk(file_dir_path):
        for filename in filenames:
            if filename[-4:] == '.txt':
                '''Encoding conversion'''
                # Convert(dirpath, filename, new_dir_path)
                '''Line-level processing'''
                name_list = RowProcess(dirpath, filename, new_dir_path)
                if not name_list:
                    continue
                '''Word-level processing'''
                WordProcess(dirpath, filename, new_dir_path, name_list,
                            nameDicList, tagger)
def genNER(OutputFileName, inputFileName):
    NER_uid = 0
    tagger = ner.SocketNER(host='localhost', port=8080)
    # because we are doing NER, we don't need to remove stopwords
    # stoplist = set('\",\',rt,\'tis,\'twas,able,about,across,after,ain\'t,all,almost,also,among,and,any,are,aren\'t,because,been,but,can,can\'t,cannot,could,could\'ve,couldn\'t,dear,did,didn\'t,does,doesn\'t,don\'t,either,else,ever,every,for,from,get,got,had,has,hasn\'t,have,he\'d,he\'ll,he\'s,her,hers,him,his,how,how\'d,how\'ll,how\'s,however,i\'d,i\'ll,i\'m,i\'ve,into,isn\'t,it\'s,its,just,least,let,like,likely,may,might,might\'ve,mightn\'t,most,must,must\'ve,mustn\'t,neither,nor,not,off,often,only,other,our,own,rather,said,say,says,shan\'t,she,she\'d,she\'ll,she\'s,should,should\'ve,shouldn\'t,since,some,than,that,that\'ll,that\'s,the,their,them,then,there,there\'s,these,they,they\'d,they\'ll,they\'re,they\'ve,this,tis,too,twas,wants,was,wasn\'t,we\'d,we\'ll,we\'re,were,weren\'t,what,what\'d,what\'s,when,when,when\'d,when\'ll,when\'s,where,where\'d,where\'ll,where\'s,which,while,who,who\'d,who\'ll,who\'s,whom,why,why\'d,why\'ll,why\'s,will,with,won\'t,would,would\'ve,wouldn\'t,yet,you,you\'d,you\'ll,you\'re,you\'ve,your,a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your'.split(","))
    with open(OutputFileName, 'wb') as outputFile:
        csvWriter = csv.writer(outputFile, delimiter=',', quotechar='"')
        with open(inputFileName, 'rb') as inputFile:
            csvReader = csv.reader(inputFile, delimiter=',', quotechar='"')
            for row in csvReader:
                if len(row) > 1:
                    NER_result_array = tagger.get_entities(row[1])
                    for NER_class in NER_result_array:
                        for NER_item in NER_result_array[NER_class]:
                            # print in CSV
                            csvWriter.writerow(
                                [NER_uid, row[0], NER_class, NER_item])
                            NER_uid += 1
                            if NER_uid % 10 == 0:
                                print NER_uid, "have done."
def get_candidates(self, corpus):
    try:
        self.start_ner_server()
        tagger = ner.SocketNER(host='localhost', port=NER_PORT)
        person_fd = Counter()
        org_fd = Counter()
        logger.info('Getting candidates with NER')
        with open(corpus) as fi:
            for lnum, ll in enumerate(fi):
                entity_dict = tagger.get_entities(ll.strip())
                person_fd.update({i.lower() for i in entity_dict.get(u'PERSON', [])
                                  if MIN_ENTITY_LENGTH <= len(i) <= MAX_ENTITY_LENGTH
                                  and len(i.split()) <= MAX_ENTITY_WORDS})
                org_fd.update({i.lower() for i in entity_dict.get(u'ORGANIZATION', [])
                               if len(i) <= MAX_ENTITY_LENGTH
                               and len(i.split()) <= MAX_ENTITY_WORDS})
                if lnum % 1000 == 0:
                    logger.info("Line %d" % lnum)
        person_candidates = {ne for ne in person_fd if person_fd[ne] > 1}
        org_candidates = {ne for ne in org_fd if org_fd[ne] > 1}
        logger.info('Person candidates: %d, Org candidates: %d'
                    % (len(person_candidates), len(org_candidates)))
        return person_candidates, org_candidates
    finally:
        self.stop_ner_server()
import json
import sys

import ner
import pymongo
import unidecode

remote_mongo_url = 'mongodb://*****:*****@ec2-34-212-201-251.us-west-2.compute.amazonaws.com/nee_experiment'
client = pymongo.MongoClient(remote_mongo_url)
db = client.nee_experiment

standfor_model_url = 'http://localhost:9000/'
stanford_properties_map = {'annotators': 'ner', 'outputFormat': 'json'}
stanford_params_map = {
    'properties': json.dumps(stanford_properties_map),
    'pipelineLanguage': 'es'
}
vision_model = ner.SocketNER(host='localhost', port=9191)
punctuation = '!"#%&\'\"()*+,-./:;<=>?[\\]^_`{|}~'


def evaluate():
    stanford_metrics = {}
    vision_metrics = {}
    tagged_tweets = db.tweets.find(
        {'$and': [{
            'nee_entities': {
                '$exists': True
            }
        }]})
def _check_ner_server(self):
    test_text = u'Kobe Bryant plays for LA Lakers'
    tagger = ner.SocketNER(host='localhost', port=NER_PORT)
    return tagger.get_entities(test_text)
# Pyner returns an empty dictionary
>>> import ner
>>> tagger = ner.SocketNER(host='localhost', port=8081)
>>> tagger.get_entities("University of California is located in California, United States")
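# A diagnostic sketch for the empty-dictionary case above (an assumption, not a
# definitive fix): inspect the raw tagged text with tag_text and check that the
# client's output_format matches the -outputFormat the NERServer was started with.
import ner

tagger = ner.SocketNER(host='localhost', port=8081, output_format='inlineXML')
raw = tagger.tag_text(
    "University of California is located in California, United States")
# With a loaded classifier and matching formats, the raw output should contain
# markup such as <ORGANIZATION>University of California</ORGANIZATION>.
print(raw)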
def get_entities(self, pageText):
    tagger = ner.SocketNER(host='localhost', port=8080)
    entities = tagger.get_entities(pageText)
    return entities
class EntityExtractor:
    vision_model = ner.SocketNER(host='localhost', port=9191)
    standfor_model_url = 'http://*****:*****@'):
            return False
        return True

    def __unify_types(self, type):
        if type in number_types:
            return 'NUMBER'
        if type in location_types:
            return 'LOCATION'
        return type
# Help screen
if filename == "-h":
    print "".join(["\n", "\t", "This is a test script for parts-of-speech analysis -- issue:", "\n"])
    print "".join(["\t", "\t", scriptname, " $FIL.seg > $FIL.pos or"])
    print "".join(["\t", "\t", scriptname, " $FIL.seg | sponge $FIL.seg"])
    print "".join(["\n", "\t", "or use the seg-PartsOfSpeech-stanford bash script for bulk processing."])
    print "".join(["\n", "\t", "See also seg-PartsOfSpeech-MBSP.", "\n"])
    quit()

# Libraries
import datetime, re

# Define the taggers (see PartsOfSpeech-StanfordNLP-01.py for nltk client)
# Currently configured -sentenceDelimiter newline -tokenize false
import ner
Mix = ner.SocketNER(host='localhost', port=9020, output_format='slashTags')
UPP = ner.SocketNER(host='localhost', port=9021, output_format='slashTags')

# Pattern for making sure sentences are split
# http://www.clips.ua.ac.be/pages/pattern-en
from pattern.en import tokenize

# Counter
n = 0

# A. Get the lines from the file
with open(filename) as fp:
    for line in fp:

        # B. Split each line into fields
        field = line.split("|")
def parse_sent():
    # initializing some variables
    locations = []
    dates = []
    uniformDates = []
    durations = []
    cal = pdt.Calendar()

    # quick error check
    if not request.json or not 'sentence' in request.json:
        abort(400)

    # connect to the instance of stanford ner
    tagger = ner.SocketNER(host='localhost', port=8080)
    sentence = request.json['sentence']

    # gets the named entities (LOCATION, DATE)
    parsedSent = tagger.get_entities(sentence)

    # handle the absence of "DURATION"
    tokens = nltk.word_tokenize(sentence)
    # nltk's pos tagger
    pos_tags = nltk.pos_tag(tokens)

    # populate dates and locations if they exist
    try:
        dates = parsedSent['DATE']
    except KeyError:
        if any(t in tokens for t in ('tomorrow', 'tommorow', 'tommorrow')):
            dates.append('tomorrow')
        else:
            print 'no DATE found'
    try:
        locations = parsedSent['LOCATION']
    except KeyError:
        print 'no LOCATION found'

    # iterate over each tuple of (word, pos)
    for i, (word, pos) in enumerate(pos_tags):
        if word.lower() in ('days', 'months', 'years', 'day', 'month', 'year'):
            tup = pos_tags[i - 1]
            if tup[1] == 'CD':  # if i-1 tagged as number
                # this is a valid duration
                durations.append(tup[0] + ' ' + pos_tags[i][0])
            elif tup[0].lower() == 'a':  # if i-1 is 'a'
                # this is a valid duration
                durations.append(tup[0] + ' ' + pos_tags[i][0])

    # formatting the dates uniformly as mm/dd/yy
    for entry in dates:
        parsed = cal.parse(entry)
        month = '0' + str(parsed[0][1]) if (parsed[0][1] < 10) else str(parsed[0][1])
        day = '0' + str(parsed[0][2]) if (parsed[0][2] < 10) else str(parsed[0][2])
        twodig = str(parsed[0][0])[-2:]
        year = '0' + twodig if (int(twodig) < 10) else twodig
        newDate = month + '/' + day + '/' + year
        uniformDates.append(newDate)

    return jsonify({
        'locations': locations,
        'dates': uniformDates,
        'durations': durations
    }), 201
def start_classifier():
    global tagger
    tagger = ner.SocketNER(host='localhost', port=8888)