# Assumed imports for this snippet; `mots` (the vocabulary of lemmas) and `vec`
# (the "famille"/"logement" centroid vectors) are module-level globals.
import os
from collections import Counter

import numpy as np
from scipy.spatial.distance import cosine
from treetaggerwrapper import TreeTagger


def desc2domaine(description_cas, dom_logement=1, dom_famille=9):
    """Classifier: decides whether the description belongs to family law or to
    housing law.

    TODO: replace with a function that uses the doc2vec vectors (I don't really
    trust nearest neighbour -- it is almost always the worst classifier).
    """
    ttroot = os.path.abspath(os.path.join(os.getcwd(), "treetagger-install"))
    tagger = TreeTagger(TAGLANG="fr", TAGDIR=ttroot)
    v = np.zeros(len(mots))
    t = [ln.split("\t") for ln in tagger.tag_text(description_cas)]
    t = [i[2] for i in t if len(i) == 3]   # keep the lemma column only
    t = [i for i in t if i in mots]        # keep lemmas present in the vocabulary
    nmots = float(len(t))
    for k, val in Counter(t).items():
        v[mots.index(k)] = val / nmots
    dfamille = cosine(v, vec["famille"])
    dlogement = cosine(v, vec["logement"])
    if dlogement < dfamille:
        return dom_logement
    else:
        return dom_famille
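
# Hedged usage sketch for desc2domaine (not from the original source): it assumes
# the globals described above (`mots`, `vec`) are already built and that a French
# TreeTagger install sits in ./treetagger-install.
if __name__ == "__main__":
    exemple = ("Le propriétaire refuse de rembourser le dépôt de garantie "
               "après la fin du bail.")
    domaine = desc2domaine(exemple)
    print("Domaine prédit:", domaine)  # 1 = housing law, 9 = family law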
def tag(text, tt_home):
    # NLTK's default tokenizers:
    # TreebankWordTokenizer + PunktSentenceTokenizer
    nltk_start = time()
    tokens = word_tokenize(text)

    # NLTK's default POS tagger
    # ?
    # Use tagset='universal' for the universal tagset
    nltk_tagged = pos_tag(tokens)
    nltk_end = time()
    nltk_execution = nltk_end - nltk_start
    logger.info("NLTK took %f seconds" % nltk_execution)

    # TreeTagger wrapper
    # Tokenization: ?
    # Default language: English
    # English: trained on the Penn Treebank
    # Default flags: -token -lemma -sgml -quiet -no-unknown
    tt_start = time()
    tt = TreeTagger(TAGDIR=tt_home)
    raw_tags = tt.tag_text(text)
    tt_end = time()
    tt_execution = tt_end - tt_start
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds" % tt_execution)

    return (nltk_tagged, nltk_execution), (tt_tagged, tt_execution)
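
# Hedged usage sketch for tag() (not part of the original snippet): it assumes the
# imports used above (time, word_tokenize, pos_tag, TreeTagger, make_tags, logger)
# are available, that NLTK's 'punkt' and 'averaged_perceptron_tagger' data are
# downloaded, and that TT_HOME points at a local TreeTagger install (hypothetical path).
if __name__ == "__main__":
    TT_HOME = "/opt/treetagger"  # hypothetical installation directory
    (nltk_result, nltk_time), (tt_result, tt_time) = tag(
        "TreeTagger and NLTK tag the same sentence.", TT_HOME)
    print(nltk_result[:3], nltk_time)
    print(tt_result[:3], tt_time)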
def __init__(self, auteur, numero, langue="fr"):
    """Create the Oeuvre object if it does not exist yet, and save it to a file
    of the same name. If it already exists, simply reload it from that file."""
    self.auteur = auteur
    self.numero = numero
    self.langue = langue
    self.categorie = None
    emplacement_textes = (emplacement_dossier_groupe + "Corpus/"
                          + dico_langues[langue] + "/Fichiers txt/")
    emplacement_oeuvres = (emplacement_dossier_groupe + "Corpus/"
                           + dico_langues[langue] + "/Fichiers oeuvres/")
    # self.infos = Infos(auteur, numero)
    print(auteur + str(numero), end=" ")
    try:
        with open(emplacement_oeuvres + auteur + str(numero), "rb") as mon_fichier:
            o = pickle.load(mon_fichier)
        self.texte_brut = o.texte_brut
        self.tags = o.tags
        self.mots = o.mots
        self.racines = o.racines
        self.POS = o.POS
        print("(importation terminee)", end=" / ")
    except FileNotFoundError:
        tagger = TreeTagger(TAGLANG=self.langue)
        self.texte_brut = formater(importer(auteur, numero, emplacement_textes))
        self.tags = make_tags(tagger.tag_text(self.texte_brut))
        self.mots = [t[0] for t in self.tags if len(t) == 3]
        self.racines = [t[2] for t in self.tags if len(t) == 3]
        self.POS = [t[1] for t in self.tags if len(t) == 3]
        with open(emplacement_oeuvres + "/" + auteur + str(numero), "wb") as mon_fichier:
            pickle.dump(self, mon_fichier, protocol=2)
        print("(creation terminee)", end=" / ")
def build_stemed_day_array(df, date_range, stop_words, lang):
    # init TreeTagger
    tagger = TreeTagger(TAGLANG=lang, TAGDIR='./TreeTagger')
    res = []
    # iterate over all days in the given date range
    for i in tqdm(date_range, desc='Stemming Tweets'):
        sentence = []
        # select all tweets of a given day
        for tweet in df[np.logical_and(df['date'] > i,
                                       df['date'] < i + pd.DateOffset(1))]['text']:
            tweet = format_text(tweet)
            # if the tweet still has content after cleaning, lemmatization starts
            if tweet != '':
                for word in [j.split('\t')[2] for j in tagger.tag_text(tweet)]:
                    if len(word) < 3:
                        continue
                    if word.lower() in stop_words:
                        continue
                    # '+', '|' and '@' are added by TreeTagger for ambiguous or
                    # compound words; they are ignored to prevent impurities
                    if "+" in word or "|" in word or "@" in word:
                        continue
                    sentence.append(word)
        res.append(" ".join(sentence))
    return res
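
# Hedged usage sketch for build_stemed_day_array (not from the original source):
# it assumes a ./TreeTagger install, the module imports used above (tqdm, numpy,
# pandas) and the external format_text() cleaning helper. The tiny DataFrame
# below is purely illustrative.
import pandas as pd

df_tweets = pd.DataFrame({
    'date': pd.to_datetime(['2020-03-01 10:00', '2020-03-02 12:30']),
    'text': ['Die Lage ist ruhig.', 'Erste Fälle wurden gemeldet.']
})
days = pd.date_range('2020-03-01', '2020-03-02')
daily_docs = build_stemed_day_array(df_tweets, days,
                                    stop_words={'die', 'ist'}, lang='de')
print(daily_docs)  # one space-joined string of lemmas per day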
class OOVclassifier(object):

    def __init__(self, stem=False):
        dictionaries = dicts()
        path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
        self.english_dict = enchant.Dict("en_EN")
        self.spanish_dict = enchant.Dict("es_ES")
        self.ND = dictionaries.norm
        self.SD = dictionaries.lemario
        self.PND = dictionaries.names
        self.stem = stem
        if stem:
            self.stemmer = SpanishStemmer()
        else:
            self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)

    def dictionary_lookup(self, word):
        result = (word in self.SD or word in self.PND
                  or word in self.ND.values())
        return result

    def affix_check(self, word):
        result = False
        if word.islower() or word.istitle():
            if self.stem:
                n = len(word)
                stem = self.stemmer.stem(word)
                # compare with the first substring of length n of each word in SD
                for w in [x[:n] for x in self.SD if len(x) >= n]:
                    result = (word == w)
                    if result:
                        break
            else:
                lemma = make_tags(self.tagger.tag_text(word))[0].lemma
                result = self.dictionary_lookup(lemma)
        return result

    def check(self, word):
        result = self.spanish_dict.check(word)
        if not result:
            result = self.dictionary_lookup(word) or self.affix_check(word)
        return result

    def check_NoES(self, word):
        result = False
        if len(word) > 1:
            result = self.english_dict.check(word)
        return result

    def classify(self, word):
        if self.check(word):
            result = 1
        elif self.check_NoES(word):
            result = 2
        else:
            result = 0
        return result
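
# Hedged usage sketch for OOVclassifier (not from the original source): it assumes
# the dicts() resource loader, the pyenchant dictionaries and the SpanishStemmer
# import used above are available on the machine (stem=True avoids needing the
# TreeTagger install).
clf = OOVclassifier(stem=True)
for token in ['casa', 'house', 'asdfgh']:
    print(token, clf.classify(token))  # 1 = Spanish, 2 = English only, 0 = neither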
from treetaggerwrapper import TreeTagger


def tagText(doc, lg):
    """Tags text with TreeTagger.

    Expects a string and returns a list of tagged items in this format:
    ['je\tPRO:PER\tje', 'suis\tVER:pres\tsuivre|être']
    """
    assert doc, "Problème : l'élément à tagger est vide"
    tag_options = TreeTagger(TAGLANG=lg, TAGOPT="-token -lemma -sgml -no-unknown")
    tags = tag_options.tag_text(doc)
    return tags
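
# Hedged usage sketch for tagText (assumes TreeTagger and its French parameter
# file are installed where treetaggerwrapper can find them).
for line in tagText("Je suis là.", "fr"):
    word, pos, lemma = line.split("\t")
    print(word, pos, lemma)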
# Assumed imports for this snippet; `d2v` is a module-level gensim Doc2Vec model.
import os

from treetaggerwrapper import TreeTagger


def text2vec(description_cas):
    """Converts text into a vector in the space of the Doc2Vec model d2v."""
    ttroot = os.path.abspath(os.path.join(os.getcwd(), "treetagger-install"))
    tagger = TreeTagger(TAGLANG="fr", TAGDIR=ttroot)
    t = [ln.split("\t") for ln in tagger.tag_text(description_cas.lower())]
    t = [i[2] for i in t if len(i) == 3]             # keep the lemma column only
    t = [i for i in t if i in d2v.wv.index2entity]   # keep lemmas known to the model
    return d2v.infer_vector(t)
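
# Hedged usage sketch for text2vec (not from the original source): it assumes the
# Doc2Vec model is already loaded into the module-level global d2v and that
# ./treetagger-install holds a French TreeTagger.
# from gensim.models.doc2vec import Doc2Vec
# d2v = Doc2Vec.load("d2v.model")  # hypothetical model file
vecteur = text2vec("Le juge a prononcé la résiliation du bail.")
print(vecteur.shape)  # vector in the Doc2Vec embedding space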
def add_pos_tags(self, treetaggerlocation, taglang="es"):
    # POS-tag the tokenized sentences according to language
    print("Adding POS-Tags...")
    pos_tagged_sents = pos_tag_sents(self.tokenizedLines_en)
    # pos_tagged_sents = [[(w1,tag1),(w2,tag2)],[(w1,t2),(w2,t2),...]...]
    for sent in pos_tagged_sents:
        fo = []
        for word in sent:
            temp = word[0] + word[1]
            fo.append(temp)
        self.pos_tagged_sents_en.append(fo)  # -> [["w1t1","w2t2",...],["w1t1",...],...]

    if taglang == "fi":
        tagger = TreeTagger(TAGLANG=taglang, TAGDIR=treetaggerlocation,
                            TAGPARFILE="Preprocessing/fi_en/finnish.par")
    if taglang == "es":
        tagger = TreeTagger(TAGLANG=taglang, TAGDIR=treetaggerlocation,
                            TAGPARFILE="Preprocessing/es_en/spanish.par")
    if taglang == "de":
        tagger = TreeTagger(TAGLANG=taglang, TAGDIR=treetaggerlocation,
                            TAGPARFILE="Preprocessing/de_en/german.par")
    if taglang == "pl":
        tagger = TreeTagger(TAGLANG=taglang, TAGDIR=treetaggerlocation,
                            TAGPARFILE="Preprocessing/pl_en/polish.par")

    pos_tagged_sents = []
    for line in self.tokenizedLines_es:
        if "EE.UU" in line:
            line_t = []
            for w in line:
                if w == "EE.UU":
                    line_t.append("EEUU")
                else:
                    line_t.append(w)
            line = line_t
        tags = tagger.tag_text(line)
        pos_tagged_sents.append(tags)
        # -> [['Esto\tDM\teste','es\tVSfin\tser','un\tART\tun','texto\tNC\ttexto'],[...],...]

    for i in range(len(pos_tagged_sents)):
        fo = []
        for word in pos_tagged_sents[i]:
            temp = word.split('\t')  # 'esto\tDM\teste' => ['esto', 'DM', 'este']
            word_n_tag = temp[0] + temp[1]  # ['esto', 'DM', 'este'] => 'estoDM'
            fo.append(word_n_tag)
        self.pos_tagged_sents_es.append(fo)
def scraping():
    """If the user did not request a scrape, look up documents for the selected
    country in the database; those documents and their photo links are returned.

    If the user requested a scrape, or if there are no (or not enough) documents
    for the selected country in the database, configure and launch the Reddit
    scrape, then tag the titles of the resulting submissions with TreeTagger and
    analyse the tags to obtain a list of potential places. Those places are
    looked up on GeoNames. The results of that last search are loaded into two
    dictionaries, one for displaying the photos on the site and the other for
    storing the results in the MongoDB database.

    NB: scraping always tries to obtain new photos (absent from MongoDB).
    """
    # Geoscape configuration
    geokey = current_app.config['GEOKEY']
    geoauth = current_app.config['GEOAUTH']

    # Parameters of the JavaScript request
    rgnversion = request.args.get('search_version')
    country = request.args.get('country')
    country_code = request.args.get('country_code')
    limit = int(request.args.get('nombre_image'))
    scrape_requested = True if request.args.get('scraping') == 'true' else False

    # Results dict for display on the site
    search_res = geo.GeoQuery(geokey, geoauth, country, country_code, 'E')
    dic_results = {
        'head': {
            'total': 0,
            'country': {
                'name': country,
                'lng': search_res.result.lng,
                'lat': search_res.result.lat
            }
        },
        'results': []
    }

    # List of documents to load into the database
    database_list = []

    if scrape_requested:
        # Only load the img_url
        load_arg = {'img_url': 1, '_id': 0}
    else:
        # Load the document for display
        load_arg = {
            'scraped_title': 0,
            'location_list': 0,
            'feature_class': 0,
            'testers': 0,
            '_id': 0
        }

    existing_urls = []
    check_db = mongo.Mongo.mongocheck('Resultats_RGN')

    # Initialise the results collection if it does not exist
    if not check_db:
        dbstart = mongo.MongoSave([{
            'key': 'Initialisation de la collection Resultats_RGN.'
        }])
        dbstart.storeindb('Resultats_RGN', img_url='A', search_version='D')
        dbstart.nonunique_index('Resultats_RGN', country='A', search_version='D')
    # Documents taken from the database are loaded into the results dict
    else:
        dbfinder = mongo.MongoLoad({
            'search_version': rgnversion,
            'country': country
        }, load_arg)
        for doc in dbfinder.retrieve('Resultats_RGN', limit=limit):
            if not scrape_requested:
                dic_results['head']['total'] += 1
                dic_results['results'].append(doc)
            existing_urls.append('-url:' + doc['img_url'])

    if scrape_requested or dic_results['head']['total'] < limit:
        # Reddit search configuration; profile loaded from praw.ini
        reddit = praw.Reddit('current_user')
        target_sub = reddit.subreddit('EarthPorn')

        query = country if country != 'United States' else 'USA'
        print('\033[92m' + target_sub.display_name + '\033[0m'
              '\nRésultats de recherche pour les soumissions reddit avec:',
              query, '\n')

        # Exclude documents already retrieved
        user_limit = limit
        if len(query) + len(existing_urls) + sum(
                len(url) for url in existing_urls) <= 512:
            query += (' ' + ' '.join(existing_urls)).rstrip()
            limit -= dic_results['head']['total']
        else:  # 512 characters max in a Reddit query
            limit = 1000  # Maximum allowed by Reddit
            existing_urls = [url[5:] for url in existing_urls]

        # TreeTagger configuration.
        # The Treetagger folder must be in the directory the program is run from.
        if sys.platform.startswith('linux'):
            reddit_tagger = TreeTagger(TAGLANG='en',
                                       TAGDIR=join(getcwd(), 'Treetagger',
                                                   'TreeTagger_unix'))
        elif sys.platform.startswith('win'):
            reddit_tagger = TreeTagger(TAGLANG='en',
                                       TAGDIR=join(getcwd(), 'Treetagger',
                                                   'TreeTagger_windows'))
        else:
            sys.exit('Système d\'exploitation non compatible avec Geoscape.')

        # Results of the subreddit search
        test_posts = target_sub.search(query, limit=limit)
        for post in test_posts:
            try:
                attempt = post.url
            except prawcore.exceptions.NotFound:
                continue  # Problem with the photo; discarded
            if post.url in existing_urls:
                continue  # Already stored in the database; discarded

            if search('\W' + country + '\W', post.title):  # Country as a distinct word
                # Skip at most one run of characters between [] or () at the start
                # of the text, and stop at the first [ or (
                res = search('^(?:[\[(].*[\])])?([^\[(]+)', post.title)
                if res:
                    print(res.group(1))
                    # Tagging: generates a list of triples (word=..., pos=..., lemma=...)
                    reddit_tags = make_tags(reddit_tagger.tag_text(res.group(1)),
                                            exclude_nottags=True)

                    # The country name is excluded from the potential places;
                    # it is only added back as a last resort
                    country_split = country.casefold().split(' ')
                    size = len(country_split)
                    indexes = []
                    if size > 1:
                        name_tags = [t[0].casefold() for t in reddit_tags]
                        for window in enumerate(windowed(name_tags, size)):
                            if all(window[1][i] == country_split[i]
                                   for i in range(size)):
                                indexes.extend([
                                    i for i in range(window[0], window[0] + size)
                                ])

                    for index, tag in enumerate(reddit_tags):
                        if tag[1] == 'NP':  # Proper-noun tag on Windows
                            reddit_tags[index] = (tag[0], 'NP0', tag[2])
                        if tag[0].casefold() == country.casefold() or index in indexes:
                            reddit_tags[index] = (tag[0], 'CTY', tag[2])
                    pprint(reddit_tags)

                    # Search for potential places, storing between the places the
                    # number of words that were not selected
                    location_list = location_finder(country, rgnversion, reddit_tags)
                    print('Lieux trouvés:', end='')
                    print(location_list, '\n')

                    # GeoNames
                    date = gmtime(post.created_utc)
                    dic_mongo = {
                        'link': 'https://www.reddit.com' + post.permalink,
                        'img_url': post.url,
                        'search_version': rgnversion,
                        'country': country,
                        'country_code': country_code,
                        'scraped_title': res.group(1).strip(),
                        'text': post.title,
                        'tag_list': reddit_tags,
                        'location_list': location_list,
                        'date': {
                            'year': date.tm_year,
                            'month': date.tm_mon,
                            'day': date.tm_mday,
                            'hour': date.tm_hour,
                            'min': date.tm_min,
                            'sec': date.tm_sec
                        }
                    }

                    try:
                        attempt = post.author.icon_img
                    except prawcore.exceptions.NotFound:
                        pass
                    else:
                        dic_mongo['author'] = {
                            'name': post.author.name,
                            'icon': post.author.icon_img,
                            'profile': 'https://www.reddit.com/user/' + post.author.name
                        }

                    """
                    R: standard search
                    RF: fuzzy search
                    E: exact search
                    EH: exact search on human-made features
                    EN: exact search on natural features
                    """
                    placefinder = geo.LocationList(country_code, location_list)
                    geo_res = placefinder.geo_search(geokey, geoauth,
                                                     'EN EH', 'R')  # GeoQuery object

                    # As a last resort, the country itself if it is in the title
                    if geo_res.result is None and country in res.group(1):
                        placefinder.reinit(country_code, [country])
                        geo_res = placefinder.geo_search(geokey, geoauth, 'E')

                    if geo_res.result is not None:
                        dic_results['head']['total'] += 1
                        print('Résultat GeoNames:', geo_res.result.address, end='')
                        print('. Après', placefinder.counter, 'requêtes.')

                        dic_mongo['name'] = geo_res.result.address
                        dic_mongo['lng'] = geo_res.result.lng
                        dic_mongo['lat'] = geo_res.result.lat
                        dic_mongo['feature_class'] = geo_res.result.feature_class
                        dic_mongo['location'] = geo_res.location

                        dic_results['results'].append(dic_mongo)
                        dic_tostore = deepcopy(dic_mongo)
                        database_list.append(dic_tostore)

                        user_limit -= 1
                        if not user_limit:
                            break

            print('\n###############')

        # Load the documents generated by the scrape into the database
        documents = mongo.MongoSave(database_list)
        documents.storeindb('Resultats_RGN')

    return jsonify(dic_results)
class TTPosTagger(object):
    """ part-of-speech tagger implemented using tree tagger and treetaggerwrapper """

    def __init__(self, language, tt_home=None, **kwargs):
        self.language = language
        self.tt_home = tt_home
        self.tokenizer = Tokenizer(language)
        self.tagger = TreeTagger(
            TAGLANG=language,
            TAGDIR=tt_home,
            # Explicit TAGOPT: the default has the '-no-unknown' option,
            # which prints the token rather than '<unknown>' for unknown lemmas.
            # We'd rather skip unknown lemmas, as they are likely to be wrong tags
            TAGOPT=u'-token -lemma -sgml -quiet',
            # Use our tokenization logic (CHUNKERPROC here)
            CHUNKERPROC=self._tokenizer_wrapper,
            **kwargs
        )

    def _tokenizer_wrapper(self, tagger, text_list):
        """ Wrap the tokenization logic with the signature required by the TreeTagger CHUNKERPROC kwarg """
        tokens = []
        for text in text_list:
            for token in self.tokenizer.tokenize(text):
                tokens.append(token)
        return tokens

    def _postprocess_tags(self, tags, skip_unknown=True):
        """ Clean tagged data from non-tags and unknown lemmas (optionally) """
        clean_tags = []
        for tag in tags:
            if skip_unknown and isinstance(tag, NotTag) or tag.lemma == u'<unknown>':
                logger.debug("Unknown lemma found: %s. Skipping ..." % repr(tag))
                continue
            clean_tags.append(tag)
        return clean_tags

    def tokenize(self, text):
        """ Splits a text into tokens """
        return self.tokenizer.tokenize(text)

    def tag_one(self, text, skip_unknown=True, **kwargs):
        """ POS-Tags the given text, optionally skipping unknown lemmas

            :param unicode text: Text to be tagged
            :param bool skip_unknown: Automatically remove unrecognized tags from the result

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
            [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
             Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
             Tag(word=u'to', pos=u'TO', lemma=u'to'),
             Tag(word=u'be', pos=u'VB', lemma=u'be'),
             Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
        """
        return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
                                      skip_unknown)

    def tag_many(self, items, document_key, pos_tag_key, batch_size=10000, **kwargs):
        """ POS-Tags many text documents of the given items. Use this for massive text tagging

            :param items: Iterable of items to tag. Generator preferred
            :param document_key: Where to find the text to tag inside each item. Text must be unicode
            :param pos_tag_key: Where to put pos tagged text

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(list(TTPosTagger('en').tag_many(
            ...     [{'text': u'Item one is in first position'}, {'text': u'In the second position is item two'}],
            ...     'text', 'tagged'
            ... )))
            [{'tagged': [Tag(word=u'Item', pos=u'NN', lemma=u'item'),
                         Tag(word=u'one', pos=u'CD', lemma=u'one'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'in', pos=u'IN', lemma=u'in'),
                         Tag(word=u'first', pos=u'JJ', lemma=u'first'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position')],
              'text': u'Item one is in first position'},
             {'tagged': [Tag(word=u'In', pos=u'IN', lemma=u'in'),
                         Tag(word=u'the', pos=u'DT', lemma=u'the'),
                         Tag(word=u'second', pos=u'JJ', lemma=u'second'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'item', pos=u'RB', lemma=u'item'),
                         Tag(word=u'two', pos=u'CD', lemma=u'two')],
              'text': u'In the second position is item two'}]
        """
        tt_pool = TaggerProcessPoll(
            TAGLANG=self.language,
            TAGDIR=self.tt_home,
            TAGOPT=u'-token -lemma -sgml -quiet',
            CHUNKERPROC=self._tokenizer_wrapper
        )
        logging.getLogger('TreeTagger').setLevel(logging.WARNING)
        try:
            jobs = []
            for i, item in enumerate(items):
                if not item.get(document_key):
                    continue
                jobs.append((item, tt_pool.tag_text_async(item[document_key], **kwargs)))
                if i % batch_size == 0:
                    for each in self._finalize_batch(jobs, pos_tag_key):
                        yield each
                    jobs = []
            for each in self._finalize_batch(jobs, pos_tag_key):
                yield each
        finally:
            tt_pool.stop_poll()

    def _finalize_batch(self, jobs, pos_tag_key):
        for item, job in jobs:
            job.wait_finished()
            item[pos_tag_key] = self._postprocess_tags(make_tags(job.result))
            yield item
# First tagging pass before lowercasing, to spot proper nouns
preTokens = nltk.word_tokenize(document)
prePosTokens = pos_tagger.tag(preTokens)

tokens = []
for i in range(len(prePosTokens)):
    if prePosTokens[i][1] == 'NPP':
        tokens.append(prePosTokens[i][0])
    else:
        tokens.append((prePosTokens[i][0]).lower())

# Lemmatization with TreeTagger
import treetaggerwrapper
from treetaggerwrapper import TreeTagger

tagger = TreeTagger(TAGLANG='fr')
tags = tagger.tag_text(tokens)
lemmas = [t[2] for t in treetaggerwrapper.make_tags(tags)]

# Alphanumeric filter
alphaTokens = [t for t in lemmas if re.match("^[A-Za-z -]+$", t)]

# Stopword filter
filteredLemmas = [t for t in alphaTokens if t not in stopFrench]
# print(filteredLemmas)

filteredText = nltk.Text(filteredLemmas)
fdistFiltered = nltk.FreqDist(filteredText)

filteredLemmasTaged = {}  # dictionary mapping each lemma to its POS tag
for i in range(len(filteredLemmas)):
    filteredLemmasTaged[filteredLemmas[i]] = pos_tagger.tag([filteredLemmas[i]])[0][1]
cat = i.get('class').strip().replace(" ", "_")
sub = i.get('subclass').strip().replace(" ", "_")
ID = i.get('id')
if not os.path.exists(mydir + "/" + source + "/" + cat + "/" + sub + "/" + ID):
    os.makedirs(mydir + "/" + source + "/" + cat + "/" + sub + "/" + ID)
ID_q = 1
ID_a = 1
for q in i.findall("./question"):
    with open(mydir + "/" + source + "/" + cat + "/" + sub + "/" + ID + "/" + ID
              + "_q" + str(ID_q) + ".conll", "w", encoding="utf-8") as question:
        for line in q.text.split("\n"):
            #os.system('echo "'+line.strip().replace('"','\"')+'" | tree-tagger-french >>'+mydir+"/"+source+"/"+cat+"/"+sub+"/"+ID+"/"+ID+"_q"+str(ID_q)+".conll")
            tags = TAGGER.tag_text(line)
            for tag in tags:
                question.write(tag + "\n")
            question.write("\n")
    #print(q.text.strip().replace('"','\"'))
    phrases.write(q.text.strip())
    ID_q += 1
for a in i.findall("./answer"):
    with open(mydir + "/" + source + "/" + cat + "/" + sub + "/" + ID + "/" + ID
              + "_a" + str(ID_q) + ".conll", "w", encoding="utf-8") as answer:
        for line in a.text.split("\n"):
            #os.system('echo "'+line.strip().replace('"','\"')+'" | tree-tagger-french >>'+mydir+"/"+source+"/"+cat+"/"+sub+"/"+ID+"/"+ID+"_q"+str(ID_q)+".conll")
            tags = TAGGER.tag_text(line)
class TTPosTagger(object):
    """ part-of-speech tagger implemented using tree tagger and treetaggerwrapper """

    def __init__(self, language, tt_home=None, **kwargs):
        self.language = language
        self.tt_home = tt_home
        self.tokenizer = Tokenizer(language)
        self.tagger = TreeTagger(
            TAGLANG=language,
            TAGDIR=tt_home,
            # Explicit TAGOPT: the default has the '-no-unknown' option,
            # which prints the token rather than '<unknown>' for unknown lemmas.
            # We'd rather skip unknown lemmas, as they are likely to be wrong tags
            TAGOPT=u'-token -lemma -sgml -quiet',
            # Use our tokenization logic (CHUNKERPROC here)
            CHUNKERPROC=self._tokenizer_wrapper,
            **kwargs)

    def _tokenizer_wrapper(self, tagger, text_list):
        """ Wrap the tokenization logic with the signature required by the TreeTagger CHUNKERPROC kwarg """
        tokens = []
        for text in text_list:
            for token in self.tokenizer.tokenize(text):
                tokens.append(token)
        return tokens

    def _postprocess_tags(self, tags, skip_unknown=True):
        """ Clean tagged data from non-tags and unknown lemmas (optionally) """
        clean_tags = []
        for tag in tags:
            if skip_unknown and isinstance(tag, NotTag) or tag.lemma == u'<unknown>':
                logger.debug("Unknown lemma found: %s. Skipping ..." % repr(tag))
                continue
            clean_tags.append(tag)
        return clean_tags

    def tokenize(self, text):
        """ Splits a text into tokens """
        return self.tokenizer.tokenize(text)

    def tag_one(self, text, skip_unknown=True, **kwargs):
        """ POS-Tags the given text, optionally skipping unknown lemmas

            :param unicode text: Text to be tagged
            :param bool skip_unknown: Automatically remove unrecognized tags from the result

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
            [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
             Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
             Tag(word=u'to', pos=u'TO', lemma=u'to'),
             Tag(word=u'be', pos=u'VB', lemma=u'be'),
             Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
        """
        return self._postprocess_tags(
            make_tags(self.tagger.tag_text(text, **kwargs)), skip_unknown)

    def tag_many(self, items, document_key, pos_tag_key, batch_size=10000, **kwargs):
        """ POS-Tags many text documents of the given items. Use this for massive text tagging

            :param items: Iterable of items to tag. Generator preferred
            :param document_key: Where to find the text to tag inside each item. Text must be unicode
            :param pos_tag_key: Where to put pos tagged text

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(list(TTPosTagger('en').tag_many(
            ...     [{'text': u'Item one is in first position'}, {'text': u'In the second position is item two'}],
            ...     'text', 'tagged'
            ... )))
            [{'tagged': [Tag(word=u'Item', pos=u'NN', lemma=u'item'),
                         Tag(word=u'one', pos=u'CD', lemma=u'one'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'in', pos=u'IN', lemma=u'in'),
                         Tag(word=u'first', pos=u'JJ', lemma=u'first'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position')],
              'text': u'Item one is in first position'},
             {'tagged': [Tag(word=u'In', pos=u'IN', lemma=u'in'),
                         Tag(word=u'the', pos=u'DT', lemma=u'the'),
                         Tag(word=u'second', pos=u'JJ', lemma=u'second'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'item', pos=u'RB', lemma=u'item'),
                         Tag(word=u'two', pos=u'CD', lemma=u'two')],
              'text': u'In the second position is item two'}]
        """
        try:
            tt_pool = TaggerProcessPoll(TAGLANG=self.language,
                                        TAGDIR=self.tt_home,
                                        TAGOPT=u'-token -lemma -sgml -quiet',
                                        CHUNKERPROC=self._tokenizer_wrapper)
        except TypeError:
            logger.warn(
                'failed to initialize tree tagger process pool, fallback to single-process tagging'
            )
            for each in items:
                text = each.get(document_key)
                if text:
                    each[pos_tag_key] = self.tag_one(text, **kwargs)
                yield each
        else:
            logging.getLogger('TreeTagger').setLevel(logging.WARNING)
            try:
                jobs = []
                s = 0
                for i, item in enumerate(items):
                    if not item.get(document_key):
                        s += 1
                        continue
                    jobs.append(
                        (item, tt_pool.tag_text_async(item[document_key], **kwargs)))
                    if len(jobs) % batch_size == 0:
                        for each in self._finalize_batch(jobs, pos_tag_key):
                            yield each
                        jobs = []
                for each in self._finalize_batch(jobs, pos_tag_key):
                    yield each
            finally:
                tt_pool.stop_poll()

    def _finalize_batch(self, jobs, pos_tag_key):
        for item, job in jobs:
            job.wait_finished()
            item[pos_tag_key] = self._postprocess_tags(make_tags(job.result))
            yield item