Example #1
def SampleResource():
    epoch = time.time()
    args = request.args.get('query')
    try:
        wiki = wikiapi.WikiApi()
        results = wiki.find(args)
        article = wiki.get_article(results[0])
        k = []
        tagger = ner.SocketNER(host='localhost', port=17017)
        sent_tokenizer = nltk.tokenize.PunktSentenceTokenizer()
        sentences = sent_tokenizer.tokenize(article.content.encode("utf-8"))
        print sentences
        data = return_data(article, sentences)
        print time.time() - epoch
        for sentence in sentences:
            j = tagger.get_entities(sentence)
            if j.get("LOCATION") is not None:
                k = k + j.get("LOCATION")
            if j.get("ORGANIZATION") is not None:
                k = k + j.get("ORGANIZATION")
            if j.get("PERSON") is not None:
                k = k + j.get("PERSON")

        return jsonify({"data": data[0],
                        "data_head": {"data_tag": data[1], "image_src": article.image},
                        "tags": list(set(k)),
                        "error": False,
                        "success": True})
    except wikipedia.DisambiguationError as e:
        return jsonify({"data": str(e), "error": True})

    except IndexError:
        return jsonify({"data": None, "error": True,
                        "message": "We don't have anything related to %s yet" % args})
Example #2
 def test(self, corpus, port=9191):
     self.tagger = ner.SocketNER("localhost",
                                 port,
                                 output_format='inlineXML')
     tagged_sentences = []
     logging.info("sending sentences to tagger {}...".format(self.path))
     for isent, sid in enumerate(self.sids):
         print("Aditi " + str(sid))
         text = " ".join([t.text for t in self.tokens[isent]])
         out = None  # avoid appending a stale result if tagging fails
         try:
             out = self.tagger.tag_text(text)
             print(text)
         except SocketError as e:
             if e.errno != errno.ECONNRESET:
                 raise  # not the error we are looking for
             print("socket error with sentence {}".format(text))
         except Exception:
             print("other socket error!")
             out = self.tagger.tag_text(text)  # retry once
         if out is not None:
             tagged_sentences.append(out)
         print(tagged_sentences)
     results = self.process_results(tagged_sentences, corpus)
     return results
Example #3
def get_ner_tagged_text(df):
    tagger = ner.SocketNER(host='localhost', port=8080)
    txt_tagged = []
    i = 1
    time_start = datetime.now()
    for txt in df.header.map(str) + df.body.map(str):
        try:
            txt_tagged.append(tagger.get_entities(txt))
            if i % 100 == 0:
                print 'Working on item:', i, '\tof', len(df)
                df_temp = df.iloc[:i]  # first i rows; .ix is deprecated
                df_temp['ner'] = txt_tagged
                df_temp = get_sentiment_score(df_temp)
                triplet = get_entity_sentment_triplet(df_temp)
                write_triplet_to_file(triplet)
                print 'Time remaining:', (datetime.now() -
                                          time_start) / i * (len(df) - i)
        except Exception:  # tagging failed for this text; record a placeholder
            txt_tagged.append('Error while tagging')
        i = i + 1
        #print 'Tagged:', tagger.get_entities(txt)
    df['ner'] = txt_tagged
    print 'NER Finished'
    #df_ner= pandas.io.json.read_json(txt_tagged)
    return df
Example #4
def produceNamedEntity(list_sent):
    """
        input:
            list of sentences in the file
        return:
            list of all the named entities in the file
    """

    # lmtzr = WordNetLemmatizer()
    pyner_tagger = ner.SocketNER(host='localhost', port=8080)

    list_ne = []
    for sent in list_sent:
        if len(sent.strip()) != 0:
            sent_ne_dict = pyner_tagger.get_entities(sent)
            sent_ne_dict_len = len(sent_ne_dict.items())

            if sent_ne_dict_len != 0:
                for entity, list_word in sent_ne_dict.items():
                    # print entity, list_word
                    for word in list_word:
                        if len(word) > 2:
                            list_ne.append(
                                str(word.lower()) + ':' + str(entity))

    return list_ne
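A hedged illustration of the output shape (exact tags depend on the classifier the server has loaded):

print produceNamedEntity(["Barack Obama visited Paris."])
# hypothetical output: ['barack obama:PERSON', 'paris:LOCATION']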
Example #5
def locations_tag(directory):
    """
    Finds location terms in all text files in a given directory

    Input:
    directory - string representing the local directory to analyze

    Output:
    locations - dictionary mapping each file containing location terms
                to the terms
    """
    locations = {}
    tagger = ner.SocketNER(host='localhost', port=8080)
    subs = {
        '\n': '.  ',
        'co.': 'County',
        'Co.': 'County',
        'county': 'County',
        'A.T.': 'Arkansas',
        'M.T.': 'Mississippi'
    }
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r') as f:
                text = f.read().decode("utf8")
                text = preprocess(text, subs)
                entities = tagger.get_entities(text)
                if 'LOCATION' in entities:
                    locs = merge_locations(entities['LOCATION'], text)
                    locations[filename] = locs
    return locations
Example #6
def map_NER_per_page(page_text, NER_port=2020):
    '''
    Uses Stanford's NER to create a map of entities contained in the text.

    arguments:
        page_text: (string) the page's full text
        NER_port: (int) port the NER server runs on. defaults to 2020
    '''

    NER_socket = ner.SocketNER(host="localhost", port=NER_port)

    try:
        result = NER_socket.get_entities(page_text)
        entities = {}

        for entity_type in [str(key) for key in result.keys()]:
            entities[entity_type] = list(
                set([str(entity) for entity in result[entity_type]]))

        return entities
    except Exception as e:
        print "map_NER_per_page ERROR"
        print e, type(e)

    return None
Example #7
def extract_entities(text):
    print text
    tagger = ner.SocketNER(host='localhost', port=8080)
    tags = tagger.get_entities(text)
    if 'PERSON' in tags.keys():
        print tags['PERSON']
        print
        return set(tags['PERSON'])
    print
    return set([])
Example #8
 def __init__(self,
              example_path,
              test_file_path_list,
              enable_saving=False,
              n_gram=5,
              **kwargs):
     super().__init__(example_path, test_file_path_list, enable_saving,
                      n_gram, **kwargs)
     self.tagger = ner.SocketNER(host='localhost', port=8081)
     self.tagged_dict = None
Example #9
 def __init__(self, cfg=''):
     min_len, max_len, ne_types, _, _ = (cfg + '::::').split(':', 4)
     self.max_length = int(max_len) if max_len else -1
     self.min_length = int(min_len) if min_len else -1
     self.ban_punct_only = (self.min_length >= 0)  # compare the parsed int, not the raw string
     if ne_types:
         self.ne_types = re.compile('^([' + ne_types + '].+)$')
         self.ner = ner.SocketNER(host='localhost', port=8080)
     else:
         self.ne_types = None
Example #10
 def __init__(self):
     try:
         self.tagger = ner.SocketNER(host='localhost', port=9191)
         self.testServer()

     except ConnectionRefusedError:
         print("WARNING: connection to NER local server refused!")

     except Exception:
         print("WARNING: the local NER doesn't work properly!")
Example #11
 def __init__(self):
     ner_configs = [
         ConfigObj(
             os.path.join(os.path.dirname(__file__), 'Stanford_NER',
                          config_file)) for config_file in NER_CONFIG_FILES
     ]
     self.ners = [
         ner.SocketNER(host=ner_config['NER_HOST'],
                       port=int(ner_config['NER_PORT']))
         for ner_config in ner_configs
     ]
Example #12
def queryForEntity(expectedEntity, passage):
    tagger = ner.SocketNER(host='localhost',
                           port=8081)  # requires server to be started
    answer = tagger.get_entities(passage)
    answers = []
    for j, currentExpectedEntity in enumerate(expectedEntity):
        for key in answer:
            if (key == currentExpectedEntity):
                for eachAnswer in answer[key]:
                    answerString = eachAnswer.encode()
                    answers.append(answerString)
    return answers
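A usage sketch for the function above (the passage and result are illustrative; entity labels follow Stanford's uppercase convention):

answers = queryForEntity(['PERSON', 'LOCATION'], "Marie Curie worked in Paris.")
# hypothetical result: ['Marie Curie', 'Paris']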
Example #13
 def annotate_sentence(self, text):
     self.tagger = ner.SocketNER("localhost", self.port, output_format='inlineXML')
     out = None  # avoid returning an undefined name if tagging fails
     try:
         out = self.tagger.tag_text(text)
     except SocketError as e:
         if e.errno != errno.ECONNRESET:
             raise  # not the error we are looking for
         print "socket error with sentence {}".format(text)
     except Exception:
         print "other socket error!"
         out = self.tagger.tag_text(text)  # retry once
     return out
Example #14
    def __init__(self):

        print "init NLP toolkit"

        self.tagger = ner.SocketNER(host='localhost', port=1234)

        # parse list of stopwords
        self.stoplist = [i.strip() for i in open(stopwords_file)]
        self.stoplist += weibo_stopwords

        # better support for traditional character
        jieba.set_dictionary(dico_file)
Example #15
 def __init__(self):
     # RSS feed of EventsDoha Website
     self.events_doha_link = 'http://www.eventsdoha.com/feed/'
     self.event_lst = []
     #self.event_dict['type'] = 'FeatureCollection'
     #self.event_dict['features'] = []
     self.db = 'webevents'
     self.db_connection = self.init_mongo()
     self.event_collection = self.db_connection[self.db]['events']
     self.eventinfor_collection = self.db_connection[self.db]['eventinfor']
     # creating object for the class
     self.geo = Geocode()
     # accessing the PYner server from QCRI
     self.tagger = ner.SocketNER(host='10.2.0.30', port=9190)
Example #16
def SampleResource():
    epoch = time.time()
    args = request.args.get('query')
    try:
        browser = mechanize.Browser()
        browser.set_handle_robots(False)
        browser.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
        )]
        data = browser.open(args)
        tagger = ner.SocketNER(host='localhost', port=17017)
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = sent_tokenizer.tokenize(
            nltk.clean_html(data.read().encode("utf-8")))
        data = return_data(sentences)
        print time.time() - epoch
        k = []
        for sentence in sentences:
            j = tagger.get_entities(sentence)
            if j.get("LOCATION") is not None:
                k = k + j.get("LOCATION")
            if j.get("ORGANIZATION") is not None:
                k = k + j.get("ORGANIZATION")
            if j.get("PERSON") is not None:
                k = k + j.get("PERSON")

        return jsonify({
            "data": data[0],
            "data_head": {
                "data_tag": data[1],
                "image_src": None
            },
            "tags": list(set(k)),
            "error": False,
            "success": True,
        })
    except wikipedia.DisambiguationError as e:
        return jsonify({"data": str(e), "error": True})

    except IndexError:
        return jsonify({
            "data": None,
            "error": True,
            "message": "We don't have anything related to %s yet" % args
        })
Example #17
 def __init__(self, document):
     super(Document, self).__init__()
     self.content = ""
     self.title = ""
     self.source = ""
     self.published = ""
     self.bag_of_words = BagOfWords()
     self.ner_tagger = ner.SocketNER(host='localhost',
                                     port=1239,
                                     output_format="slashTags")
     if Document.vocabulary is None:
         Document.vocabulary = BagOfWords()
     self.read_document(document)
     Document.number_of_documents += 1
     self.list_rep = []
Example #18
def getEntities(texts):
    """
    Run the Stanford NER in server mode using the following command:
    java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/english.muc.7class.distsim.crf.ser.gz -port 8000 -outputFormat inlineXML
    """
    if not isinstance(texts, list):
        texts = [texts]

    tagger = ner.SocketNER(host='localhost', port=8000)
    entities = []
    for t in texts:
        sentence_entities = tagger.get_entities(t)
        entities.append(sentence_entities)
    return entities
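A usage sketch, assuming the NERServer from the docstring is listening on port 8000 (the sentence and output are illustrative):

entities = getEntities(["Barack Obama visited Paris in 2009."])
# hypothetical result with the 7-class classifier:
# [{'PERSON': ['Barack Obama'], 'LOCATION': ['Paris'], 'DATE': ['2009']}]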
Example #19
def ner_main(total):
    # corpus
    reader = moretags.read_gmb(moretags.corpus_root, 1000)
    data = list(reader)
    training_samples = data[:int(len(data) * 0.9)]
    test_samples = data[int(len(data) * 0.9):]
    chunker = moretags.NamedEntityChunker(training_samples[:5000])
    print "#training samples = %s" % len(training_samples)  # training samples = 55809
    print "#test samples = %s" % len(test_samples)  # test samples = 6201

    # scrape data from wikipedia
    uri = '/wiki/Kevin_Bacon'
    links, info, text = get_info(uri)
    names = set()
    tagger = ner.SocketNER(host='localhost', port=4295, output_format='slashTags')
    result = []
    while len(links) > 0 and len(names) < total:
        uri = links[random.randint(0, len(links) - 1)].attrs['href']
        name = uri[6:]
        if name not in names:
            names.add(name)
            print(name)
            print('#{}'.format(len(names)))  # count of names collected so far
            try:
                links, info, text = get_info(uri)
                save_in_mongo(info, is_update=False)
            except Exception as err:
                print(err)
                continue
            # print('Name of this page is {0}\nInformation Card\n{1}\nUri:{2}'
            #      .format(name, info, uri))
            try:
                text = clean_data(text)
                """
                result = ner_analyse(text,chunker)
                rels=extract_rels(result)
                print ('Relations are \n\n\n{0}\n\n\n'.format(rels))
                """

                entities = ner_analyse_crfs(tagger, text)
                # print ('crf analysis result is \n{0}'.format(entities))
                # result.draw()
                # result= nltk.tree2conlltags(result)
                # print ('Relation entities are like \n{0}'.format(result))
                save_in_mysql(name, entities)
            except Exception as err:
                print ('Named Entity Analysis Error:\n{0}'.format(err))
Example #20
def start():
    '''
      Two ways to use this script:
        1. Run it from an IDE
        2. Run it from the command line
    '''
    if len(sys.argv) < 2:
        '''Run from an IDE'''
        file_dir_path = r'C:\Users\bnuzgn\Desktop\Xin'
        save_dir_path = r'C:\Users\bnuzgn\Desktop\Xin2'
    elif len(sys.argv) == 3:
        '''Run from the command line'''
        file_dir_path = sys.argv[1]
        save_dir_path = sys.argv[2]
    else:
        '''Bad arguments'''
        print('Argument error: argument 1 is the input path, argument 2 is the save path.')
        return
    '''Create the save directory if necessary'''
    new_dir_path = createDir(save_dir_path)
    if not os.path.isdir(file_dir_path):
        print(file_dir_path + ' does not exist')
        return
    '''Start the pyner socket'''
    tagger = ner.SocketNER(host='localhost', port=9191)
    '''Read and load name.list'''
    nameDicPath = r'C:\Users\bnuzgn\Desktop\name.txt'
    with open(nameDicPath, 'r', encoding='utf-8') as f:
        nameLines = f.readlines()
    nameDicList = []
    for line in nameLines:
        nameDicList.append(line.split()[1])
    '''Process every file under file_dir_path'''
    for dirpath, dirnames, filenames in os.walk(file_dir_path):
        for filename in filenames:
            if filename[-4:] == '.txt':
                '''Encoding conversion'''
                #                Convert(dirpath,filename,new_dir_path)
                '''Per-line processing'''
                name_list = RowProcess(dirpath, filename, new_dir_path)
                if not name_list:
                    continue
                '''Per-word processing'''
                WordProcess(dirpath, filename, new_dir_path, name_list,
                            nameDicList, tagger)
Example #21
def genNER(OutputFileName, inputFileName):

    NER_uid = 0
    tagger = ner.SocketNER(host='localhost', port=8080)
    # because we are doing NER, so we don't need to remove stopwords
    #    stoplist = set('\",\',rt,\'tis,\'twas,able,about,across,after,ain\'t,all,almost,also,among,and,any,are,aren\'t,because,been,but,can,can\'t,cannot,could,could\'ve,couldn\'t,dear,did,didn\'t,does,doesn\'t,don\'t,either,else,ever,every,for,from,get,got,had,has,hasn\'t,have,he\'d,he\'ll,he\'s,her,hers,him,his,how,how\'d,how\'ll,how\'s,however,i\'d,i\'ll,i\'m,i\'ve,into,isn\'t,it\'s,its,just,least,let,like,likely,may,might,might\'ve,mightn\'t,most,must,must\'ve,mustn\'t,neither,nor,not,off,often,only,other,our,own,rather,said,say,says,shan\'t,she,she\'d,she\'ll,she\'s,should,should\'ve,shouldn\'t,since,some,than,that,that\'ll,that\'s,the,their,them,then,there,there\'s,these,they,they\'d,they\'ll,they\'re,they\'ve,this,tis,too,twas,wants,was,wasn\'t,we\'d,we\'ll,we\'re,were,weren\'t,what,what\'d,what\'s,when,when,when\'d,when\'ll,when\'s,where,where\'d,where\'ll,where\'s,which,while,who,who\'d,who\'ll,who\'s,whom,why,why\'d,why\'ll,why\'s,will,with,won\'t,would,would\'ve,wouldn\'t,yet,you,you\'d,you\'ll,you\'re,you\'ve,your,a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your'.split(","))

    with open(OutputFileName, 'wb') as outputFile:
        csvWriter = csv.writer(outputFile, delimiter=',', quotechar='"')
        with open(inputFileName, 'rb') as inputFile:
            csvReader = csv.reader(inputFile, delimiter=',', quotechar='"')
            for row in csvReader:
                if len(row) > 1:
                    NER_result_array = tagger.get_entities(row[1])
                    for NER_class in NER_result_array:
                        for NER_item in NER_result_array[NER_class]:
                            # print in CSV
                            csvWriter.writerow(
                                [NER_uid, row[0], NER_class, NER_item])
                            NER_uid += 1
                            if NER_uid % 10 == 0:
                                print NER_uid, "items done."
Example #22
    def get_candidates(self, corpus):
        try:
            self.start_ner_server()
            tagger = ner.SocketNER(host='localhost', port=NER_PORT)
            person_fd = Counter()
            org_fd = Counter()
            logger.info('Getting candidates with NER')
            with open(corpus) as fi:
                for lnum, ll in enumerate(fi):
                    entity_dict = tagger.get_entities(ll.strip())
                    person_fd.update({i.lower() for i in entity_dict.get(u'PERSON', [])
                        if MIN_ENTITY_LENGTH <= len(i) <= MAX_ENTITY_LENGTH and len(i.split()) <= MAX_ENTITY_WORDS})
                    org_fd.update({i.lower() for i in entity_dict.get(u'ORGANIZATION', [])
                        if len(i) <= MAX_ENTITY_LENGTH and len(i.split()) <= MAX_ENTITY_WORDS})
                    if lnum%1000 == 0:
                        logger.info("Line %d"%lnum)
            person_candidates = {ne for ne in person_fd if person_fd[ne] > 1}
            org_candidates = {ne for ne in org_fd if org_fd[ne] > 1}

            logger.info('Person candidates: %d, Org candidates: %d'%(len(person_candidates), len(org_candidates)))
            return person_candidates, org_candidates
        finally:
            self.stop_ner_server()
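The start_ner_server / stop_ner_server helpers are not shown in this example; a hedged sketch of what they might look like, reusing the java command from Example #18 (the jar path, classifier, and startup delay are assumptions):

import subprocess
import time

def start_ner_server(self):
    # Hypothetical helper: launch Stanford's NERServer as a child process.
    self._ner_proc = subprocess.Popen([
        'java', '-mx1000m', '-cp', 'stanford-ner.jar',
        'edu.stanford.nlp.ie.NERServer',
        '-loadClassifier', 'classifiers/english.muc.7class.distsim.crf.ser.gz',
        '-port', str(NER_PORT), '-outputFormat', 'inlineXML',
    ])
    time.sleep(3)  # crude wait for the JVM to bind the port

def stop_ner_server(self):
    self._ner_proc.terminate()
    self._ner_proc.wait()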
Example #23
import unidecode
import ner
import sys
import json
import pymongo

remote_mongo_url = 'mongodb://*****:*****@ec2-34-212-201-251.us-west-2.compute.amazonaws.com/nee_experiment'
client = pymongo.MongoClient(remote_mongo_url)
db = client.nee_experiment

standfor_model_url = 'http://localhost:9000/'
stanford_properties_map = {'annotators': 'ner', 'outputFormat': 'json'}
stanford_params_map = {
    'properties': json.dumps(stanford_properties_map),
    'pipelineLanguage': 'es'
}

vision_model = ner.SocketNER(host='localhost', port=9191)

punctuation = '!"#%&\'\"()*+,-./:;<=>?[\\]^_`{|}~'


def evaluate():

    stanford_metrics = {}
    vision_metrics = {}

    tagged_tweets = db.tweets.find(
        {'$and': [{
            'nee_entities': {
                '$exists': True
            }
        }]})
Example #24
 def _check_ner_server(self):
     test_text = u'Kobe Bryant plays for LA Lakers'
     tagger = ner.SocketNER(host='localhost', port=NER_PORT)
     return tagger.get_entities(test_text)
Example #25
# Pyner empty dictionary
>>> import ner
>>> tagger = ner.SocketNER(host='localhost', port=8081)
>>> tagger.get_entities("University of California is located in California, United States")
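With a standard 3-class English classifier loaded, the call above normally returns populated lists rather than an empty dict; an empty result usually means the client and server disagree on the output format. A hedged illustration of the expected shape:

>>> tagger.get_entities("University of California is located in California, United States")
{'ORGANIZATION': ['University of California'], 'LOCATION': ['California', 'United States']}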
Example #26
 def get_entities(self, pageText):
     tagger = ner.SocketNER(host='localhost', port=8080)
     entities = tagger.get_entities(pageText)
     return entities
Example #27
class EntityExtractor:

    vision_model = ner.SocketNER(host='localhost', port=9191)
    standfor_model_url = 'http://*****:*****@'):
            return False
        return True

    def __unify_types(self, type):
        if type in number_types:
            return 'NUMBER'
        if type in location_types:
            return 'LOCATION'
        return type
Example #28
# Help screen
if filename == "-h" :  
   print "".join([ "\n","\t","This is a test script for parts-of-speech analysis -- issue:","\n" ])
   print "".join([ "\t","\t",scriptname," $FIL.seg > $FIL.pos or" ])
   print "".join([ "\t","\t",scriptname," $FIL.seg | sponge $FIL.seg" ])
   print "".join([ "\n","\t","or use the seg-PartsOfSpeech-stanford bash script for bulk processing." ])
   print "".join([ "\n","\t","See also seg-PartsOfSpeech-MBSP.","\n" ])
   quit()

# Libraries
import datetime, re

# Define the taggers (see PartsOfSpeech-StanfordNLP-01.py for nltk client)
# Currently configured -sentenceDelimiter newline -tokenize false
import ner
Mix = ner.SocketNER(host='localhost', port=9020, output_format='slashTags')
UPP = ner.SocketNER(host='localhost', port=9021, output_format='slashTags') 

# Pattern for making sure sentences are split
# http://www.clips.ua.ac.be/pages/pattern-en
from pattern.en import tokenize

# Counter
n = 0

# A. Get the lines from the file
with open(filename) as fp:
   for line in fp:

# B. Split each line into fields
      field = line.split("|")
Example #29
def parse_sent():

    # initializing some variables
    locations = []
    dates = []
    uniformDates = []
    durations = []
    cal = pdt.Calendar()

    # quick error check
    if not request.json or not 'sentence' in request.json:
        abort(400)

    # connect to the instance of stanford ner
    tagger = ner.SocketNER(host='localhost', port=8080)
    sentence = request.json['sentence']

    # gets the named entities (LOCATION, DATE)
    parsedSent = tagger.get_entities(sentence)

    # handle the absence of "DURATION"
    tokens = nltk.word_tokenize(sentence)  # nltk's pos tagger
    pos_tags = nltk.pos_tag(tokens)

    # populate dates and locations if they exist
    try:
        dates = parsedSent['DATE']
    except KeyError:
        if any(t in tokens for t in ('tomorrow', 'tommorow', 'tommorrow')):
            dates.append('tomorrow')
        else:
            print 'no DATE found'

    try:
        locations = parsedSent['LOCATION']
    except KeyError:
        print 'no LOCATION found'

    # iterate over each tuple of (word, pos)
    for i, (word, pos) in enumerate(pos_tags):
        if word.lower() in ('days', 'months', 'years', 'day', 'month', 'year'):
            tup = pos_tags[i - 1]
            if tup[1] == 'CD':  # if i-1 tagged as number
                # this is a valid duration
                durations.append(tup[0] + ' ' + pos_tags[i][0])
            elif tup[0].lower() == 'a':  # if i-1 is 'a'
                # this is a valid duration
                durations.append(tup[0] + ' ' + pos_tags[i][0])

    # formatting the dates uniformly as mm/dd/yy
    for entry in dates:
        parsed = cal.parse(entry)
        month = '0' + str(parsed[0][1]) if (parsed[0][1] < 10) else str(
            parsed[0][1])
        day = '0' + str(parsed[0][2]) if (parsed[0][2] < 10) else str(
            parsed[0][2])
        twodig = str(parsed[0][0])[-2:]
        year = '0' + twodig if (int(twodig) < 10) else twodig
        newDate = month + '/' + day + '/' + year
        uniformDates.append(newDate)

    return jsonify({
        'locations': locations,
        'dates': uniformDates,
        'durations': durations
    }), 201
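The zero-padding block above can be collapsed with time.strftime, since parsedatetime's Calendar.parse() returns a struct_time-compatible 9-tuple; a minimal sketch under that assumption:

import time
for entry in dates:
    parsed = cal.parse(entry)
    # format month/day/two-digit-year in one step
    uniformDates.append(time.strftime('%m/%d/%y', time.struct_time(parsed[0])))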
Example #30
def start_classifier():
    global tagger
    tagger = ner.SocketNER(host='localhost', port=8888)
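A usage sketch for this module-level pattern (the sample sentence and result are illustrative):

start_classifier()
entities = tagger.get_entities("Angela Merkel met Barack Obama in Berlin.")
# hypothetical result: {'PERSON': ['Angela Merkel', 'Barack Obama'], 'LOCATION': ['Berlin']}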