# ----- Example no. 1 (score: 0) -----
 def __init__(self):
     """Set up the shared 'horus' logger with a dated file handler and a
     console handler.

     Handlers are attached only when the logger has none yet, so creating
     several instances does not duplicate log output.
     """
     import os  # local import: this snippet has no visible top-level imports
     self.logger = logging.getLogger('horus')
     self.conf = HorusConfig()
     if len(self.logger.handlers) == 0:
         self.logger.setLevel(logging.DEBUG)
         now = datetime.datetime.now()
         # os.path.join avoids a malformed path when root_dir lacks a
         # trailing separator (plain concatenation silently produced
         # e.g. '/rootlog/horus_...' in that case).
         log_path = os.path.join(self.conf.root_dir, 'log',
                                 'horus_' + now.strftime("%Y-%m-%d") + '.log')
         handler = logging.FileHandler(log_path)
         formatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")
         handler.setFormatter(formatter)
         self.logger.addHandler(handler)
         console_handler = logging.StreamHandler()
         console_handler.setFormatter(formatter)
         self.logger.addHandler(console_handler)
# ----- Example no. 2 (score: 0) -----
import os

from horus.core.config import HorusConfig

# Resolve the project root: prefer the configured root_dir, falling back
# to this file's own directory when the config leaves it blank.
config = HorusConfig()
ROOT_DIR = config.root_dir if config.root_dir != '' else os.path.dirname(os.path.abspath(__file__))

# Command line used to launch the CMU TweetNLP tagger JVM process.
RUN_TAGGER_CMD = " ".join([
    config.models_tweetnlp_java_param,
    config.models_tweetnlp_jar,
    "--model",
    config.models_tweetnlp_model,
])

# Label sets emitted by each NER backend, grouped by entity class
# (person / organization / location).

# Ritter Twitter-NER tagset (BIO scheme).
NER_RITTER_PER = ['B-person', 'I-person']
NER_RITTER_ORG = ['B-company', 'I-company']
NER_RITTER_LOC = ['B-geo-loc', 'I-geo-loc']

# Stanford NER tagset. GSP = geo-political social group.
NER_STANFORD_PER = ['PERSON']
NER_STANFORD_ORG = ['ORGANIZATION', 'GSP']
NER_STANFORD_LOC = ['LOCATION']

# NLTK ne_chunk tagset. GPE = geo-political entity such as a city,
# state/province, or country.
NER_NLTK_PER = ['B-PERSON', 'I-PERSON', 'PERSON']
NER_NLTK_ORG = ['B-ORGANIZATION', 'I-ORGANIZATION', 'ORGANIZATION', 'GSP']
NER_NLTK_LOC = ['B-LOCATION', 'I-LOCATION', 'LOCATION', 'GPE']

# CoNLL-2003 tagset (IO scheme).
NER_CONLL_PER = ['I-PER']
NER_CONLL_ORG = ['I-ORG']
NER_CONLL_LOC = ['I-LOC']
# ----- Example no. 3 (score: 0) -----
class NLPTools(object):
    """Thin wrapper around the NLTK, Stanford, and CMU TweetNLP taggers."""

    # Shared configuration, loaded once at class-definition time.
    config = HorusConfig()

    def tokenize_and_pos_nltk(self, text):
        """Tokenize *text* (string or pre-tokenized list) and POS-tag it
        with NLTK, returning tokens, Penn-tagged pairs, and universal-tagged
        pairs.
        """
        # TODO: NLTK's tokenizer replaces '' with `` (note from original)
        if type(text) is list:
            tokens = text
        else:
            tokens = nltk.word_tokenize(text.decode('utf-8'))
        penn = nltk.pos_tag(tokens)
        return tokens, penn, nltk.pos_tag(tokens, tagset="universal")

    def annotate_ner_nltk(self, tagged):
        """Run NLTK's NE chunker over POS-*tagged* pairs and return the
        third (NE label) column of the CoNLL rendering, one entry per line.
        """
        conll_lines = nltk.tree2conllstr(nltk.ne_chunk(tagged, binary=False)).split('\n')
        return [line.split(' ')[2] for line in conll_lines]

    def annotate_ner_stanford(self, text):
        """NER-tag whitespace-split *text* with the Stanford model."""
        return self.stanford_ner.tag(text.split())

    def tokenize_and_pos_stanford(self, text):
        """Whitespace-tokenize *text* and POS-tag it with the Stanford
        tagger; the universal-tag slot is returned empty.
        """
        words = text.decode('utf-8').split()
        return words, self.stanford_pos.tag(words), []

    def tokenize_and_pos_twitter(self, text):
        """POS-tag one tweet with the CMU TweetNLP tagger, returning tokens
        plus (token, Penn tag) and (token, universal tag) pairs.
        """
        decoded = [w.decode('utf8') for w in text.split()]
        tokens, penn_tags, uni_tags = [], [], []
        for sentence in CMUTweetTagger.runtagger_parse(decoded):
            # token_tag tuples are indexed (not unpacked): the tagger may
            # append extra fields beyond (token, tag).
            for token_tag in sentence:
                tokens.append(token_tag[0])
                penn_tags.append(token_tag[1])
                uni_tags.append(self.convert_penn_to_universal_tags(token_tag[1]))
        return tokens, zip(tokens, penn_tags), zip(tokens, uni_tags)

    def tokenize_and_pos_twitter_list(self, text):
        """POS-tag a list of tweets with the CMU TweetNLP tagger, returning
        per-tweet token lists, Penn-tag pairs, and universal-tag pairs.
        """
        # TODO: possibly redundant with tokenize_and_pos_twitter (original note)
        token_list, tagged_list, pos_universal_list = [], [], []
        for sentence in CMUTweetTagger.runtagger_parse(text):
            tokens = [token_tag[0] for token_tag in sentence]
            penn_tags = [token_tag[1] for token_tag in sentence]
            uni_tags = [self.convert_penn_to_universal_tags(token_tag[1])
                        for token_tag in sentence]
            token_list.append(tokens)
            tagged_list.append(zip(tokens, penn_tags))
            pos_universal_list.append(zip(tokens, uni_tags))
        return token_list, tagged_list, pos_universal_list

    @staticmethod
    def convert_cmu_to_universal_tags(cmu_tag):
        """Map a CMU tag to its universal tag; "X" when unmapped."""
        return next((item[1] for item in definitions.CMU_UNI_TAGS
                     if item[0] == cmu_tag), "X")

    @staticmethod
    def convert_penn_to_universal_tags(penn_tag):
        """Map a Penn tag to its universal tag; the input tag when unmapped."""
        return next((item[1] for item in definitions.PENN_UNI_TAG
                     if item[0] == penn_tag), penn_tag)

    def __init__(self):
        """Load the Stanford NER and POS models declared in the config."""
        self.stanford_ner = StanfordNERTagger(
            self.config.model_stanford_filename_ner,
            self.config.model_stanford_path_jar_ner)
        self.stanford_pos = StanfordPOSTagger(
            self.config.model_stanford_filename_pos,
            self.config.model_stanford_path_jar_pos)
        # Give the POS tagger JVM 8 GB of heap.
        self.stanford_pos.java_options = '-mx8g'