Example #1
# Imports assumed by this snippet (cosine is assumed to be SciPy's cosine distance);
# the module-level globals `mots` (vocabulary list) and `vec` (reference vectors)
# are defined elsewhere in the original project.
import os
from collections import Counter

import numpy as np
from scipy.spatial.distance import cosine
from treetaggerwrapper import TreeTagger


def desc2domaine(description_cas, dom_logement=1, dom_famille=9):
    """Classifier: decides whether the case description belongs to family law
    or to housing law.
    TODO: replace this with a function that uses the doc2vec models (I don't
    really trust nearest neighbour; it is almost always the worst classifier).
    """

    ttroot = os.path.abspath(os.path.join(os.getcwd(), "treetagger-install"))

    tagger = TreeTagger(
        TAGLANG="fr",
        TAGDIR=ttroot
    )
    v = np.zeros(len(mots))
    # Lemmatize the description and keep only lemmas that are in the vocabulary.
    t = [ln.split("\t") for ln in tagger.tag_text(description_cas)]
    t = [i[2] for i in t if len(i) == 3]
    t = [i for i in t if i in mots]

    nmots = float(len(t))

    for k, val in Counter(t).items():
        v[mots.index(k)] = val / nmots

    dfamille = cosine(v, vec["famille"])
    dlogement = cosine(v, vec["logement"])

    if dlogement < dfamille:
        return dom_logement
    else:
        return dom_famille
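A minimal usage sketch for the classifier above. The module-level globals `mots` (vocabulary list) and `vec` (reference term-frequency vectors for the two domains) are built elsewhere in the original project; the values below are purely illustrative.

import numpy as np

# Hypothetical vocabulary and reference vectors (normally built from a corpus).
mots = ["bail", "loyer", "divorce", "pension"]
vec = {
    "logement": np.array([0.5, 0.5, 0.0, 0.0]),
    "famille": np.array([0.0, 0.0, 0.5, 0.5]),
}

code = desc2domaine("Mon propriétaire refuse de rendre le dépôt de garantie.")
print(code)  # 1 (dom_logement) when the description is closer to the housing vector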
Example #2
def tag(text, tt_home):
    # NLTK's default tokenizer
    # TreebankWordTokenizer + PunktSentenceTokenizer
    nltk_start = time()
    tokens = word_tokenize(text)
    # NLTK's default POS tagger
    # ?
    # Use tagset='universal' for universal tagset
    nltk_tagged = pos_tag(tokens)
    nltk_end = time()
    nltk_execution = nltk_end - nltk_start
    logger.info("NLTK took %f seconds" % nltk_execution)

    # TreeTagger wrapper
    # Tokenization: ?
    # Default language: English
    # English: trained on Penn treebank
    # Default flags: -token -lemma -sgml -quiet -no-unknown
    tt_start = time()
    tt = TreeTagger(TAGDIR=tt_home)
    raw_tags = tt.tag_text(text)
    tt_end = time()
    tt_execution = tt_end - tt_start
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds" % tt_execution)
    return (nltk_tagged, nltk_execution), (tt_tagged, tt_execution)
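A quick usage sketch, assuming the module already imports time.time, NLTK's word_tokenize and pos_tag, and treetaggerwrapper's TreeTagger and make_tags, and defines a logger; the TreeTagger install path is hypothetical.

nltk_res, tt_res = tag("The quick brown fox jumps over the lazy dog.",
                       tt_home="/opt/treetagger")  # hypothetical TAGDIR
nltk_tagged, nltk_secs = nltk_res   # list of (token, POS) tuples
tt_tagged, tt_secs = tt_res         # list of Tag namedtuples from make_tags
print(nltk_secs, tt_secs)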
Example #3
 def __init__(self, auteur, numero, langue="fr"):
     """Creates the Oeuvre object if it does not exist yet and saves it to a file
     of the same name. If it already exists, it is simply reloaded from that file."""
     self.auteur = auteur
     self.numero = numero
     self.langue = langue
     self.categorie = None
     emplacement_textes = emplacement_dossier_groupe + "Corpus/" + dico_langues[langue] + "/Fichiers txt/"
     emplacement_oeuvres = emplacement_dossier_groupe + "Corpus/" + dico_langues[langue] + "/Fichiers oeuvres/"
     #self.infos = Infos(auteur,numero)
     print(auteur + str(numero), end = " ")
     try:
         with open(emplacement_oeuvres + auteur + str(numero), "rb") as mon_fichier:
             o = pickle.load(mon_fichier)
         self.texte_brut = o.texte_brut
         self.tags = o.tags
         self.mots = o.mots
         self.racines = o.racines
         self.POS = o.POS
         print("(importation terminee)", end = " / ")
     except FileNotFoundError:
         tagger = TreeTagger(TAGLANG = self.langue)
         self.texte_brut = formater(importer(auteur, numero,emplacement_textes))
         self.tags = make_tags(tagger.tag_text(self.texte_brut))
         self.mots = [t[0] for t in self.tags if len(t) == 3]
         self.racines = [t[2] for t in self.tags if len(t) == 3]
         self.POS = [t[1] for t in self.tags if len(t) == 3]
         with open(emplacement_oeuvres + auteur + str(numero), "wb") as mon_fichier:
             pickle.dump(self, mon_fichier, protocol=2)
         print("(creation terminee)", end = " / ")
Example #4
def build_stemed_day_array(df, date_range, stop_words, lang):

    # init TreeTagger
    tagger = TreeTagger(TAGLANG=lang, TAGDIR='./TreeTagger')

    res = []
    # iterating over all days in the given day range
    for i in tqdm(date_range, desc='Stemming Tweets'):
        sentence = []

        # selecting all tweets of a certain day
        for tweet in df[np.logical_and(
                df['date'] > i, df['date'] < i + pd.DateOffset(1))]['text']:

            tweet = format_text(tweet)

            # if the tweet has content left after cleaning, lemmatization will start
            if tweet != '':
                for word in [j.split('\t')[2] for j in tagger.tag_text(tweet)]:
                    if len(word) < 3:
                        continue
                    if word.lower() in stop_words:
                        continue

                    # '+', '|' and '@' are inserted by TreeTagger for ambiguous
                    # lemmas and compound words; skip them to prevent impurities.
                    if "+" in word or "|" in word or "@" in word:
                        continue

                    sentence.append(word)
        res.append(" ".join(sentence))
    return res
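A minimal usage sketch, assuming a DataFrame with 'date' and 'text' columns, a pandas date_range over the days of interest, a stop-word set, and the module's format_text helper; the './TreeTagger' install directory must exist as in the constructor above.

import pandas as pd

df = pd.DataFrame({
    "date": pd.to_datetime(["2020-03-01 10:00", "2020-03-02 18:30"]),
    "text": ["First tweet about the weather", "Second tweet about politics"],
})
days = pd.date_range("2020-03-01", "2020-03-02")

daily_docs = build_stemed_day_array(df, days, stop_words={"the", "about"}, lang="en")
# daily_docs holds one space-joined string of lemmas per day in the range.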
Example #6
class OOVclassifier(object):
    def __init__(self, stem=False):
        dictionaries = dicts()
        path = '/home/alangb/TWPP'  # path to TreeTagger installation directory
        self.english_dict = enchant.Dict("en_EN")
        self.spanish_dict = enchant.Dict("es_ES")
        self.ND = dictionaries.norm
        self.SD = dictionaries.lemario
        self.PND = dictionaries.names
        self.stem = stem
        if stem:
            self.stemmer = SpanishStemmer()
        else:
            self.tagger = TreeTagger(TAGLANG='es', TAGDIR=path)

    def dictionary_lookup(self, word):
        result = (word in self.SD or word in self.PND
                  or word in self.ND.values())
        return result

    def affix_check(self, word):
        result = False
        if word.islower() or word.istitle():
            if self.stem:
                stem = self.stemmer.stem(word)
                n = len(stem)
                # compare the stem with the length-n prefix of each word in SD
                for w in (x[:n] for x in self.SD if len(x) >= n):
                    result = (stem == w)
                    if result:
                        break
            else:
                lemma = make_tags(self.tagger.tag_text(word))[0].lemma
                result = self.dictionary_lookup(lemma)
        return result

    def check(self, word):
        result = self.spanish_dict.check(word)
        if not result:
            result = self.dictionary_lookup(word) or self.affix_check(word)
        return result

    def check_NoES(self, word):
        result = False
        if len(word) > 1:
            result = self.english_dict.check(word)
        return result

    def classify(self, word):
        if self.check(word):
            result = 1
        elif self.check_NoES(word):
            result = 2
        else:
            result = 0
        return result
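A brief usage sketch; the dicts() helper and the hard-coded TreeTagger path in the constructor are assumed to be available from the original project. classify() maps a token to 1 (recognised Spanish word), 2 (recognised English word) or 0 (out of vocabulary in both).

clf = OOVclassifier(stem=True)      # stem=True uses the Snowball stemmer instead of TreeTagger
print(clf.classify("caminando"))    # 1 if found via the Spanish dictionaries
print(clf.classify("walking"))      # 2 if only the English dictionary recognises it
print(clf.classify("qwzrtx"))       # 0 otherwise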
Example #7
def tagText(doc, lg):
    """
    permet de tagger du texte avec TreeTagger
    attend une chaîne de caractères
    renvoie une liste d'éléments taggés sous ce format : ['je\tPRO:PER\tje', 'suis\tVER:pres\tsuivre|être']
    """
    assert doc, "Problème : l'élément à tagger est vide"
    tag_options = TreeTagger(TAGLANG=lg,
                             TAGOPT="-token -lemma -sgml -no-unknown")
    tags = tag_options.tag_text(doc)
    return tags
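A short usage sketch; treetaggerwrapper.make_tags can convert the raw tab-separated strings into Tag named tuples (the language code is just an example).

from treetaggerwrapper import make_tags

raw = tagText("Je suis content.", "fr")
print(raw[0])             # e.g. 'Je\tPRO:PER\tje'
print(make_tags(raw)[0])  # e.g. Tag(word='Je', pos='PRO:PER', lemma='je')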
Example #8
def text2vec(description_cas):
    "Convertit du texte en vecteur sur l'espace du modèle Doc2Vec d2v"

    ttroot = os.path.abspath(os.path.join(os.getcwd(), "treetagger-install"))

    tagger = TreeTagger(TAGLANG="fr", TAGDIR=ttroot)
    # Lemmatize, then keep only lemmas present in the Doc2Vec vocabulary.
    t = [ln.split("\t") for ln in tagger.tag_text(description_cas.lower())]
    t = [i[2] for i in t if len(i) == 3]
    t = [i for i in t if i in d2v.wv.index2entity]

    return d2v.infer_vector(t)
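A minimal usage sketch, assuming a trained gensim Doc2Vec model is available as the module-level d2v used above; the inferred vectors can then be compared, for example with a cosine similarity.

import numpy as np

v1 = text2vec("Mon propriétaire refuse de rendre le dépôt de garantie.")
v2 = text2vec("Le locataire conteste l'augmentation du loyer.")
sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(sim)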
Example #9
 def add_pos_tags(self, treetaggerlocation, taglang="es"):
     # POS-tag the tokenized sentences according to language
     print("Adding POS-Tags...")
     pos_tagged_sents = pos_tag_sents(self.tokenizedLines_en)
     # pos_tagged_sents = [[(w1,tag1),(w2,tag2)],[(w1,t2),(w2,t2),...]...]
     for sent in pos_tagged_sents:
         fo = []
         for word in sent:
             temp = word[0] + word[1]
             fo.append(temp)
         self.pos_tagged_sents_en.append(
             fo)  # -> [["w1t1","w2t2",...],["w1t1",...],...]
     if taglang == "fi":
         tagger = TreeTagger(TAGLANG=taglang,
                             TAGDIR=treetaggerlocation,
                             TAGPARFILE="Preprocessing/fi_en/finnish.par")
     if taglang == "es":
         tagger = TreeTagger(TAGLANG=taglang,
                             TAGDIR=treetaggerlocation,
                             TAGPARFILE="Preprocessing/es_en/spanish.par")
     if taglang == "de":
         tagger = TreeTagger(TAGLANG=taglang,
                             TAGDIR=treetaggerlocation,
                             TAGPARFILE="Preprocessing/de_en/german.par")
     if taglang == "pl":
         tagger = TreeTagger(TAGLANG=taglang,
                             TAGDIR=treetaggerlocation,
                             TAGPARFILE="Preprocessing/pl_en/polish.par")
     pos_tagged_sents = []
     for line in self.tokenizedLines_es:
         if "EE.UU" in line:
             line_t = []
             for w in line:
                 if w == "EE.UU":
                     line_t.append("EEUU")
                 else:
                     line_t.append(w)
             line = line_t
         tags = tagger.tag_text(line)
         pos_tagged_sents.append(tags)
         # -> [['Esto\tDM\teste','es\tVSfin\tser', 'un\tART\tun','texto\tNC\ttexto',],[...],...]
     for i in range(len(pos_tagged_sents)):
         fo = []
         for word in pos_tagged_sents[i]:
             temp = word.split('\t')  # 'esto\tDM\teste' => ['esto', 'DM', 'este']
             word_n_tag = temp[0] + temp[1]  # ['esto', 'DM', 'este'] => 'estoDM'
             fo.append(word_n_tag)
         self.pos_tagged_sents_es.append(fo)
Example #10
def scraping():
    """Si l'utilisateur n'a pas demandé un scraping, recherche de documents du pays sélectionné
	dans la base de données; ces documents et leurs liens vers les photos seront renvoyés.
	Si l'utilisateur a demandé un scraping, ou s'il n'y a pas ou pas assez de documents du pays
	sélectionné dans la base de données, configuration et lancement du scrape sur Reddit, puis
	étiquetage des titres des soumissions résultats par TreeTagger, et analyse des étiquettes
	pour obtenir une liste de lieux potentiels.
	Ces lieux sont recherchés sur geonames. Les résultats de cette dernière recherche sont chargés
	dans deux dictionnaires, l'un pour l'affichage des photos sur le site et l'autre pour stocker
	les résultats dans la base de données sur mongoDB.
	NB: le scraping tente toujours d'obtenir de nouvelles photos (absentes de mongoDB).
	"""

    # Geoscape configuration
    geokey = current_app.config['GEOKEY']
    geoauth = current_app.config['GEOAUTH']

    # Parameters from the JavaScript request
    rgnversion = request.args.get('search_version')
    country = request.args.get('country')
    country_code = request.args.get('country_code')
    limit = int(request.args.get('nombre_image'))
    scrape_requested = request.args.get('scraping') == 'true'

    # Results dictionary for display on the site
    search_res = geo.GeoQuery(geokey, geoauth, country, country_code, 'E')
    dic_results = {
        'head': {
            'total': 0,
            'country': {
                'name': country,
                'lng': search_res.result.lng,
                'lat': search_res.result.lat
            }
        },
        'results': []
    }
    # Batch list for documents to store in the database
    database_list = []

    if scrape_requested:  # only the img_url fields are loaded
        load_arg = {'img_url': 1, '_id': 0}
    else:  # load the whole document for display
        load_arg = {
            'scraped_title': 0,
            'location_list': 0,
            'feature_class': 0,
            'testers': 0,
            '_id': 0
        }

    existing_urls = []
    check_db = mongo.Mongo.mongocheck('Resultats_RGN')

    # Initialize the results collection if it does not exist yet
    if not check_db:
        dbstart = mongo.MongoSave([{
            'key': 'Initialisation de la collection Resultats_RGN.'
        }])
        dbstart.storeindb('Resultats_RGN', img_url='A', search_version='D')
        dbstart.nonunique_index('Resultats_RGN',
                                country='A',
                                search_version='D')

    # Documents found in the database are loaded into the results dictionary
    else:
        dbfinder = mongo.MongoLoad(
            {
                'search_version': rgnversion,
                'country': country
            }, load_arg)
        for doc in dbfinder.retrieve('Resultats_RGN', limit=limit):
            if not scrape_requested:
                dic_results['head']['total'] += 1
                dic_results['results'].append(doc)
            existing_urls.append('-url:' + doc['img_url'])

    if scrape_requested or dic_results['head']['total'] < limit:
        # Reddit search configuration; the profile is loaded from praw.ini
        reddit = praw.Reddit('current_user')

        target_sub = reddit.subreddit('EarthPorn')
        query = country if country != 'United States' else 'USA'
        print(
            '\033[92m' + target_sub.display_name + '\033[0m'
            '\nRésultats de recherche pour les soumissions reddit avec:',
            query, '\n')

        # Exclude documents that have already been retrieved
        user_limit = limit

        if len(query) + len(existing_urls) + sum(
                len(url) for url in existing_urls) <= 512:
            query += (' ' + ' '.join(existing_urls)).rstrip()
            limit -= dic_results['head']['total']
        else:  # Reddit queries are limited to 512 characters
            limit = 1000  # maximum allowed by Reddit

        existing_urls = [url[5:] for url in existing_urls]

        # TreeTagger configuration. The Treetagger folder must be in the directory the program is run from
        if sys.platform.startswith('linux'):
            reddit_tagger = TreeTagger(TAGLANG='en',
                                       TAGDIR=join(getcwd(), 'Treetagger',
                                                   'TreeTagger_unix'))
        elif sys.platform.startswith('win'):
            reddit_tagger = TreeTagger(TAGLANG='en',
                                       TAGDIR=join(getcwd(), 'Treetagger',
                                                   'TreeTagger_windows'))
        else:
            sys.exit('Système d\'exploitation non compatible avec Geoscape.')

        # Search results from the subreddit
        test_posts = target_sub.search(query, limit=limit)

        for post in test_posts:
            try:
                attempt = post.url
            except prawcore.exceptions.NotFound:
                continue  # problem with the photo; discard it

            if post.url in existing_urls:
                continue  # already stored in the database; discard it

            if search(r'\W' + country + r'\W',
                      post.title):  # country as a distinct word
                # Skip at most once over characters between [] or () at the start
                # of the title and stop at the first [ or (
                res = search(r'^(?:[\[(].*[\])])?([^\[(]+)', post.title)
                if res:
                    print(res.group(1))

                    # Tagging: produces a list of triples (word=..., pos=..., lemma=...)
                    reddit_tags = make_tags(reddit_tagger.tag_text(
                        res.group(1)),
                                            exclude_nottags=True)

                    # The country name is excluded from candidate places; it is re-added only as a last resort
                    country_split = country.casefold().split(' ')
                    size = len(country_split)
                    indexes = []
                    if size > 1:
                        name_tags = [t[0].casefold() for t in reddit_tags]
                        for start, window in enumerate(windowed(name_tags, size)):
                            if all(window[i] == country_split[i]
                                   for i in range(size)):
                                indexes.extend(range(start, start + size))

                    for index, tag in enumerate(reddit_tags):
                        if tag[1] == 'NP':  # proper-noun tag on Windows
                            reddit_tags[index] = (tag[0], 'NP0', tag[2])
                        if tag[0].casefold() == country.casefold() or index in indexes:
                            reddit_tags[index] = (tag[0], 'CTY', tag[2])
                    pprint(reddit_tags)

                    # Search for candidate places, with the number of unselected words stored between places
                    location_list = location_finder(country, rgnversion,
                                                    reddit_tags)

                    print('Lieux trouvés:', end='')
                    print(location_list, '\n')

                    # GeoNames
                    date = gmtime(post.created_utc)
                    dic_mongo = {
                        'link': 'https://www.reddit.com' + post.permalink,
                        'img_url': post.url,
                        'search_version': rgnversion,
                        'country': country,
                        'country_code': country_code,
                        'scraped_title': res.group(1).strip(),
                        'text': post.title,
                        'tag_list': reddit_tags,
                        'location_list': location_list,
                        'date': {
                            'year': date.tm_year,
                            'month': date.tm_mon,
                            'day': date.tm_mday,
                            'hour': date.tm_hour,
                            'min': date.tm_min,
                            'sec': date.tm_sec
                        }
                    }

                    try:
                        attempt = post.author.icon_img
                    except prawcore.exceptions.NotFound:
                        pass
                    else:
                        dic_mongo['author'] = {
                            'name': post.author.name,
                            'icon': post.author.icon_img,
                            'profile': 'https://www.reddit.com/user/' + post.author.name
                        }
                    """ R: recherche standard
						RF: recherche fuzzy
						E: recherche exacte
						EH: recherche exacte sur ensembles humains
						EN: recherche exacte sur ensembles naturels
					"""

                    placefinder = geo.LocationList(country_code, location_list)
                    geo_res = placefinder.geo_search(geokey, geoauth, 'EN EH',
                                                     'R')  #Objet GeoQuery

                    # As a last resort, the country itself if it appears in the title
                    if geo_res.result is None and country in res.group(1):
                        placefinder.reinit(country_code, [country])
                        geo_res = placefinder.geo_search(geokey, geoauth, 'E')

                    if geo_res.result is not None:
                        dic_results['head']['total'] += 1
                        print('Résultat GeoNames:',
                              geo_res.result.address,
                              end='')
                        print('. Après', placefinder.counter, 'requêtes.')

                        dic_mongo['name'] = geo_res.result.address  # place name
                        dic_mongo['lng'] = geo_res.result.lng
                        dic_mongo['lat'] = geo_res.result.lat
                        dic_mongo['feature_class'] = geo_res.result.feature_class
                        dic_mongo['location'] = geo_res.location

                        dic_results['results'].append(dic_mongo)

                        dic_tostore = deepcopy(dic_mongo)
                        database_list.append(dic_tostore)

                        user_limit -= 1
                        if not user_limit:
                            break

                print('\n###############')

        # Store the documents generated by the scrape in the database
        documents = mongo.MongoSave(database_list)
        documents.storeindb('Resultats_RGN')

    return jsonify(dic_results)
Example #11
class TTPosTagger(object):
    """ part-of-speech tagger implemented using tree tagger and treetaggerwrapper """

    def __init__(self, language, tt_home=None, **kwargs):
        self.language = language
        self.tt_home = tt_home
        self.tokenizer = Tokenizer(language)
        self.tagger = TreeTagger(
            TAGLANG=language,
            TAGDIR=tt_home,
            # Explicit TAGOPT: the default has the '-no-unknown' option,
            # which prints the token rather than '<unknown>' for unknown lemmas
            # We'd rather skip unknown lemmas, as they are likely to be wrong tags
            TAGOPT=u'-token -lemma -sgml -quiet',
            # Use our tokenization logic (CHUNKERPROC here)
            CHUNKERPROC=self._tokenizer_wrapper,
            **kwargs
        )

    def _tokenizer_wrapper(self, tagger, text_list):
        """ Wrap the tokenization logic with the signature required by the TreeTagger CHUNKERPROC kwarg
        """
        tokens = []
        for text in text_list:
            for token in self.tokenizer.tokenize(text):
                tokens.append(token)
        return tokens

    def _postprocess_tags(self, tags, skip_unknown=True):
        """ Clean tagged data from non-tags and unknown lemmas (optionally) """
        clean_tags = []
        for tag in tags:
            if skip_unknown and (isinstance(tag, NotTag) or tag.lemma == u'<unknown>'):
                logger.debug("Unknown lemma found: %s. Skipping ..." % repr(tag))
                continue
            clean_tags.append(tag)
        return clean_tags

    def tokenize(self, text):
        """ Splits a text into tokens
        """
        return self.tokenizer.tokenize(text)

    def tag_one(self, text, skip_unknown=True, **kwargs):
        """ POS-Tags the given text, optionally skipping unknown lemmas

            :param unicode text: Text to be tagged
            :param bool skip_unknown: Automatically remove unrecognized tags from the result

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
            [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
             Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
             Tag(word=u'to', pos=u'TO', lemma=u'to'),
             Tag(word=u'be', pos=u'VB', lemma=u'be'),
             Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
        """
        return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
                                      skip_unknown)

    def tag_many(self, items, document_key, pos_tag_key, batch_size=10000, **kwargs):
        """ POS-Tags many text documents of the given items. Use this for massive text tagging

            :param items: Iterable of items to tag. Generator preferred
            :param document_key: Where to find the text to tag inside each item. Text must be unicode
            :param pos_tag_key: Where to put pos tagged text

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(list(TTPosTagger('en').tag_many(
            ...     [{'text': u'Item one is in first position'}, {'text': u'In the second position is item two'}],
            ...     'text', 'tagged'
            ... )))
            [{'tagged': [Tag(word=u'Item', pos=u'NN', lemma=u'item'),
                         Tag(word=u'one', pos=u'CD', lemma=u'one'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'in', pos=u'IN', lemma=u'in'),
                         Tag(word=u'first', pos=u'JJ', lemma=u'first'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position')],
              'text': u'Item one is in first position'},
             {'tagged': [Tag(word=u'In', pos=u'IN', lemma=u'in'),
                         Tag(word=u'the', pos=u'DT', lemma=u'the'),
                         Tag(word=u'second', pos=u'JJ', lemma=u'second'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'item', pos=u'RB', lemma=u'item'),
                         Tag(word=u'two', pos=u'CD', lemma=u'two')],
              'text': u'In the second position is item two'}]
        """
        tt_pool = TaggerProcessPoll(
            TAGLANG=self.language,
            TAGDIR=self.tt_home,
            TAGOPT=u'-token -lemma -sgml -quiet',
            CHUNKERPROC=self._tokenizer_wrapper
        )
        logging.getLogger('TreeTagger').setLevel(logging.WARNING)
        try:
            jobs = []
            for i, item in enumerate(items):
                if not item.get(document_key):
                    continue

                jobs.append((item, tt_pool.tag_text_async(item[document_key], **kwargs)))
                if i % batch_size == 0:
                    for each in self._finalize_batch(jobs, pos_tag_key):
                        yield each
                    jobs = []
            for each in self._finalize_batch(jobs, pos_tag_key):
                yield each
        finally:
            tt_pool.stop_poll()

    def _finalize_batch(self, jobs, pos_tag_key):
        for item, job in jobs:
            job.wait_finished()
            item[pos_tag_key] = self._postprocess_tags(make_tags(job.result))
            yield item
Example #12
# First tagging pass, before lowercasing, to spot proper nouns
preTokens = nltk.word_tokenize(document)
prePosTokens = pos_tagger.tag(preTokens)

tokens = []
for i in range(len(prePosTokens)):
    if prePosTokens[i][1] == 'NPP':
        tokens.append(prePosTokens[i][0])
    else:
        tokens.append((prePosTokens[i][0]).lower())

# Lemmatization with TreeTagger
import treetaggerwrapper
from treetaggerwrapper import TreeTagger
tagger = TreeTagger(TAGLANG='fr')
tags = tagger.tag_text(tokens)
lemmas = [t[2] for t in treetaggerwrapper.make_tags(tags)]

# Keep only alphabetic tokens (letters, spaces and hyphens)
alphaTokens = [t for t in lemmas if re.match("^[A-Za-z -]+$", t)]

# Stop-word filter
filteredLemmas = [t for t in alphaTokens if t not in stopFrench]
#print(filteredLemmas)
filteredText = nltk.Text(filteredLemmas)
fdistFiltered = nltk.FreqDist(filteredText)

filteredLemmasTaged = {}  # dictionary: lemma -> POS tag
for lemma in filteredLemmas:
    filteredLemmasTaged[lemma] = pos_tagger.tag([lemma])[0][1]
Example #13
    cat = i.get('class').strip().replace(" ", "_")
    sub = i.get('subclass').strip().replace(" ", "_")
    ID = i.get('id')
    if not os.path.exists(mydir + "/" + source + "/" + cat + "/" + sub + "/" +
                          ID):
        os.makedirs(mydir + "/" + source + "/" + cat + "/" + sub + "/" + ID)
    ID_q = 1
    ID_a = 1
    for q in i.findall("./question"):
        with open(mydir + "/" + source + "/" + cat + "/" + sub + "/" + ID +
                  "/" + ID + "_q" + str(ID_q) + ".conll",
                  "w",
                  encoding="utf-8") as question:
            for line in q.text.split("\n"):
                #os.system('echo "'+line.strip().replace('"','\"')+'" | tree-tagger-french >>'+mydir+"/"+source+"/"+cat+"/"+sub+"/"+ID+"/"+ID+"_q"+str(ID_q)+".conll")
                tags = TAGGER.tag_text(line)
                for tag in tags:
                    question.write(tag + "\n")
                question.write("\n")

        #print(q.text.strip().replace('"','\"'))
        phrases.write(q.text.strip())
        ID_q += 1
    for a in i.findall("./answer"):
        with open(mydir + "/" + source + "/" + cat + "/" + sub + "/" + ID +
                  "/" + ID + "_a" + str(ID_a) + ".conll",
                  "w",
                  encoding="utf-8") as answer:
            for line in a.text.split("\n"):
                #os.system('echo "'+line.strip().replace('"','\"')+'" | tree-tagger-french >>'+mydir+"/"+source+"/"+cat+"/"+sub+"/"+ID+"/"+ID+"_q"+str(ID_q)+".conll")
                tags = TAGGER.tag_text(line)
Example #14
class TTPosTagger(object):
    """ part-of-speech tagger implemented using tree tagger and treetaggerwrapper """
    def __init__(self, language, tt_home=None, **kwargs):
        self.language = language
        self.tt_home = tt_home
        self.tokenizer = Tokenizer(language)
        self.tagger = TreeTagger(
            TAGLANG=language,
            TAGDIR=tt_home,
            # Explicit TAGOPT: the default has the '-no-unknown' option,
            # which prints the token rather than '<unknown>' for unknown lemmas
            # We'd rather skip unknown lemmas, as they are likely to be wrong tags
            TAGOPT=u'-token -lemma -sgml -quiet',
            # Use our tokenization logic (CHUNKERPROC here)
            CHUNKERPROC=self._tokenizer_wrapper,
            **kwargs)

    def _tokenizer_wrapper(self, tagger, text_list):
        """ Wrap the tokenization logic with the signature required by the TreeTagger CHUNKERPROC kwarg
        """
        tokens = []
        for text in text_list:
            for token in self.tokenizer.tokenize(text):
                tokens.append(token)
        return tokens

    def _postprocess_tags(self, tags, skip_unknown=True):
        """ Clean tagged data from non-tags and unknown lemmas (optionally) """
        clean_tags = []
        for tag in tags:
            if skip_unknown and (isinstance(tag, NotTag)
                                 or tag.lemma == u'<unknown>'):
                logger.debug("Unknown lemma found: %s. Skipping ..." %
                             repr(tag))
                continue
            clean_tags.append(tag)
        return clean_tags

    def tokenize(self, text):
        """ Splits a text into tokens
        """
        return self.tokenizer.tokenize(text)

    def tag_one(self, text, skip_unknown=True, **kwargs):
        """ POS-Tags the given text, optionally skipping unknown lemmas

            :param unicode text: Text to be tagged
            :param bool skip_unknown: Automatically remove unrecognized tags from the result

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
            [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
             Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
             Tag(word=u'to', pos=u'TO', lemma=u'to'),
             Tag(word=u'be', pos=u'VB', lemma=u'be'),
             Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
        """
        return self._postprocess_tags(
            make_tags(self.tagger.tag_text(text, **kwargs)), skip_unknown)

    def tag_many(self,
                 items,
                 document_key,
                 pos_tag_key,
                 batch_size=10000,
                 **kwargs):
        """ POS-Tags many text documents of the given items. Use this for massive text tagging

            :param items: Iterable of items to tag. Generator preferred
            :param document_key: Where to find the text to tag inside each item. Text must be unicode
            :param pos_tag_key: Where to put pos tagged text

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(list(TTPosTagger('en').tag_many(
            ...     [{'text': u'Item one is in first position'}, {'text': u'In the second position is item two'}],
            ...     'text', 'tagged'
            ... )))
            [{'tagged': [Tag(word=u'Item', pos=u'NN', lemma=u'item'),
                         Tag(word=u'one', pos=u'CD', lemma=u'one'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'in', pos=u'IN', lemma=u'in'),
                         Tag(word=u'first', pos=u'JJ', lemma=u'first'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position')],
              'text': u'Item one is in first position'},
             {'tagged': [Tag(word=u'In', pos=u'IN', lemma=u'in'),
                         Tag(word=u'the', pos=u'DT', lemma=u'the'),
                         Tag(word=u'second', pos=u'JJ', lemma=u'second'),
                         Tag(word=u'position', pos=u'NN', lemma=u'position'),
                         Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
                         Tag(word=u'item', pos=u'RB', lemma=u'item'),
                         Tag(word=u'two', pos=u'CD', lemma=u'two')],
              'text': u'In the second position is item two'}]
        """
        try:
            tt_pool = TaggerProcessPoll(TAGLANG=self.language,
                                        TAGDIR=self.tt_home,
                                        TAGOPT=u'-token -lemma -sgml -quiet',
                                        CHUNKERPROC=self._tokenizer_wrapper)
        except TypeError:
            logger.warning(
                'failed to initialize tree tagger process pool, falling back to single-process tagging'
            )
            for each in items:
                text = each.get(document_key)
                if text:
                    each[pos_tag_key] = self.tag_one(text, **kwargs)
                    yield each
        else:
            logging.getLogger('TreeTagger').setLevel(logging.WARNING)
            try:
                jobs = []
                s = 0
                for i, item in enumerate(items):
                    if not item.get(document_key):
                        s += 1
                        continue

                    jobs.append(
                        (item,
                         tt_pool.tag_text_async(item[document_key], **kwargs)))
                    if len(jobs) % batch_size == 0:
                        for each in self._finalize_batch(jobs, pos_tag_key):
                            yield each
                        jobs = []
                for each in self._finalize_batch(jobs, pos_tag_key):
                    yield each
            finally:
                tt_pool.stop_poll()

    def _finalize_batch(self, jobs, pos_tag_key):
        for item, job in jobs:
            job.wait_finished()
            item[pos_tag_key] = self._postprocess_tags(make_tags(job.result))
            yield item