def __init__(self):
    """Bind the shared 'horus' logger and the project configuration.

    Handlers are attached only when the logger has none yet, so creating
    several instances does not duplicate the log output. When handlers are
    attached, the logger level is set to DEBUG and records go both to a
    dated file under ``self.conf.dir_log`` and to a console stream handler.
    """
    self.logger = logging.getLogger('horus')
    self.conf = HorusConfig()

    # The logger was already configured (handlers present): nothing to add.
    if self.logger.handlers:
        return

    self.logger.setLevel(logging.DEBUG)

    # One log file per calendar day, named after the current date.
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    log_path = self.conf.dir_log + ('horus_' + today + '.log')

    layout = logging.Formatter(
        "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s"
    )

    # File handler first, then the console handler; both share one format.
    for sink in (logging.FileHandler(log_path), logging.StreamHandler()):
        sink.setFormatter(layout)
        self.logger.addHandler(sink)
def main():
    """Parse the command-line options and print the features of ``--text``.

    The module docstring and the option help are always printed before the
    arguments are validated. The parser reports an error (and exits) when
    neither ``--text`` nor ``--file`` is supplied.
    """
    parser = OptionParser(
        usage='usage: %prog [options] arguments (example: main.py --text="paris hilton was once the toast of the town."'
    )

    # (flag, keyword arguments) pairs, registered in declaration order.
    option_table = (
        ("--text", {"dest": "text",
                    "help": "The text to be annotated"}),
        ("--file", {"dest": "file",
                    "help": "The file to be annotated"}),
        ("--ds_format", {"dest": "ds_format", "default": 0,
                         "help": "The format to be annotated [0 = input text (default), 1 = Ritter, 2 = CoNNL]"}),
        ("--output_file", {"dest": "output_file", "default": "horus_out",
                           "help": "The output file"}),
        ("--output_format", {"dest": "output_format", "default": "json",
                             "help": "The output file type"}),
    )
    for flag, settings in option_table:
        parser.add_option(flag, **settings)

    opts, _ = parser.parse_args()

    print(__doc__)
    parser.print_help()

    if not (opts.text or opts.file):
        parser.error('inform either an [text] or [file] as parameter!')

    # NOTE(review): --file satisfies the check above, but only opts.text is
    # handed to the extractor below; with --file alone it receives None.
    # The remaining parsed options are not used here either. Confirm intent.
    extractor = FeatureExtraction(
        HorusConfig(),
        load_sift=1,
        load_tfidf=1,
        load_cnn=0,
        load_topic_modeling=1,
    )
    print(extractor.extract_features_text(opts.text))
try: text = self.clean_text(text) if len(str(text)) == 0: return '' if isinstance(text, unicode) == True: text = text.encode('ascii', 'ignore') params = {'text': text, 'to': to_lang} h = self.get_header() print(h) translationData = requests.get(self.translateUrl, params=params, headers=h) #urllib.urlencode() if translationData.status_code != 200: raise Exception(':: error: bing translation status code: ' + str(translationData.status_code) + ' - ' + str(translationData.text)) translation = ElementTree.fromstring( translationData.text.encode('utf-8')) return translation.text except: raise if __name__ == "__main__": config = HorusConfig() t = BingTranslator(config) print t.translate("hey what's up dude?", 'pt-br') print t.detect_language("Que lingua estou falando, amigo?") print t.detect_language("Green Newsfeed") print t.translate("Green Newsfeed", 'pt')
def __init__(self):
    """Load the Stanford NER/POS taggers and the word2vec embeddings.

    All model locations are read from a freshly created ``HorusConfig``.
    The configuration is stored on ``self.config`` because the tagger
    arguments below are read from that attribute; binding it only to a
    local name left ``self.config`` unset in this method.
    """
    self.config = HorusConfig()

    self.stanford_ner = StanfordNERTagger(
        self.config.model_stanford_filename_ner,
        self.config.model_stanford_path_jar_ner,
    )
    self.stanford_pos = StanfordPOSTagger(
        self.config.model_stanford_filename_pos,
        self.config.model_stanford_path_jar_pos,
    )
    # Java options for the POS tagger process; '-mx8g' presumably raises the
    # JVM maximum heap to 8 GB -- confirm against the deployment machines.
    self.stanford_pos.java_options = '-mx8g'

    # Embeddings are loaded from the binary word2vec file at embeddings_path.
    self.word2vec_google = gensim.models.KeyedVectors.load_word2vec_format(
        self.config.embeddings_path, binary=True
    )
import spacy import en_core_web_sm import os from src.config import HorusConfig os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' nlp = en_core_web_sm.load() #spacy.load('en') conf = HorusConfig() import shorttext #emb = '/Volumes/dne5data/embeddings/GoogleNews-vectors-negative300.bin.gz' #emb = '/Users/diegoesteves/Downloads/GoogleNews-vectors-negative300 (1).bin' dict = { 'per': [ 'arnett', 'david', 'richard', 'james', 'frank', 'george', 'misha', 'students', 'education', 'coach', 'football', 'turkish', 'albanian', 'romanian', 'professor', 'lawyer', 'president', 'king', 'man', 'woman', 'danish', 'we', 'he', 'their', 'born', 'directed', 'died', 'lives', 'boss', 'syrian', 'elected', 'minister', 'candidate', 'daniel', 'robert', 'dude', 'guy', 'girl', 'woman', 'husband', 'actor', 'people', 'celebrity' ], 'loc': [ 'china', 'usa', 'germany', 'leipzig', 'alaska', 'poland', 'jakarta', 'kitchen', 'house', 'brazil', 'fuji', 'prison', 'portugal', 'lisbon', 'france', 'oslo', 'airport', 'road', 'highway', 'forest', 'sea', 'lake', 'stadium', 'hospital', 'temple', 'beach', 'hotel', 'country',