def tok(tex):
    """
    Tags the text and returns it in a user-friendly, column-based format.
    Input: text
    Output: tagged text
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    tags = tagger.tag_text(tex)
    tags2 = treetaggerwrapper.make_tags(tags)
    pprint.pprint(tags2)

    # Build one tab-separated line per tag (word, POS, lemma).
    grammar = []
    for element in tags2:
        for i in element:
            grammar.append(i)
            grammar.append("\t")
        grammar.append("\n")

    return "".join(grammar)
Example #2
def write_to_json(words, data, destination, tagger):
    data.setdefault("list", [])
    listItem = {}
    if len(destination) > 0:
        listItem["party"] = destination[0]
        listItem["lastName"] = destination[1]
        listItem["firstName"] = destination[2]
        listItem["year"] = destination[3]
        listItem["month"] = destination[4]
        listItem["day"] = destination[5][:-4]  # strip the file extension
        listItem.setdefault("words", [])
    else:
        listItem.setdefault("text", [])

    for item in words:
        y = {}
        word, count = item
        tag = tagger.tag_text(unicode(word))  # Python 2 code: unicode() builtin
        maketags = treetaggerwrapper.make_tags(tag)
        for made_tag in maketags:  # keeps the POS of the last tag
            y["tag"] = made_tag[1]
        y["word"] = word
        y["count"] = count

        if len(destination) > 0:
            listItem["words"].append(y)
        else:
            listItem["text"].append(y)

    data["list"].append(listItem)
def _pos_tag(words, lang: str):
    # words is expected to be a list of str tokens
    tagger = _get_tagger(lang)
    # we do our own chunking, so call TreeTagger with a list of words instead
    tags_strs = tagger.tag_text(words, tagonly=True)
    return treetaggerwrapper.make_tags(tags_strs,
                                       exclude_nottags=False,
                                       allow_extra=True)
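_get_tagger is referenced above but not shown. A plausible sketch (an assumption, not the source's implementation) caches one TreeTagger instance per language, since constructing a tagger is expensive:

import functools
import treetaggerwrapper

@functools.lru_cache(maxsize=None)
def _get_tagger(lang):
    # One cached tagger per language code ('en', 'fr', ...).
    return treetaggerwrapper.TreeTagger(TAGLANG=lang)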
Example #4
def tag(text, tt_home):
    # NLTK's default tokenizer:
    # TreebankWordTokenizer + PunktSentenceTokenizer
    nltk_start = time()
    tokens = word_tokenize(text)
    # NLTK's default POS tagger
    # ?
    # Use tagset='universal' for the universal tagset
    nltk_tagged = pos_tag(tokens)
    nltk_end = time()
    nltk_execution = nltk_end - nltk_start
    logger.info("NLTK took %f seconds" % nltk_execution)

    # TreeTagger wrapper
    # Tokenization: ?
    # Default language: English
    # English: trained on Penn treebank
    # Default flags: -token -lemma -sgml -quiet -no-unknown
    tt_start = time()
    tt = TreeTagger(TAGDIR=tt_home)
    raw_tags = tt.tag_text(text)
    tt_end = time()
    tt_execution = tt_end - tt_start
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds" % tt_execution)
    return (nltk_tagged, nltk_execution), (tt_tagged, tt_execution)
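A hedged usage sketch for the benchmark above; the TreeTagger install path is an assumption:

import os

text = "The quick brown fox jumps over the lazy dog."
(nltk_tagged, nltk_secs), (tt_tagged, tt_secs) = tag(
    text, os.path.expanduser("~/treetagger"))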
Example #5
    def fit_transform(self):

        # Parsing (lemmatisation and POS tagging)
        tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
        df = self.df  # work on the instance's dataframe
        df['tags'] = df[self.text_column].apply(
            lambda x: treetaggerwrapper.make_tags(tagger.tag_text(x)))
        df['lemma'] = df.tags.apply(lambda x: [(t.lemma).lower() if isinstance(
            t, treetaggerwrapper.Tag) else '' for t in x])
        df['text_lemma'] = df.apply(lambda row: " ".join(row.lemma), axis=1)
        df['pos'] = df.tags.apply(
            lambda x:
            [t.pos if isinstance(t, treetaggerwrapper.Tag) else '' for t in x])

        # Surface-based features
        df['number_verbs'] = df.pos.apply(lambda x: self.number_verbs(x))
        df['number_proper_nouns'] = df.pos.apply(
            lambda x: self.number_proper_nouns(x))
        df['number_imperative_verb'] = df.pos.apply(
            lambda x: self.number_imperative_verb(x))

        # Sentiment features
        lex_dict = self.lex_df.to_dict('index')
        list_lex_exp = df.lemma.apply(lambda x: [
            lex_dict[key] if self.has_expression(exp=lex_dict[key]['lemma'],
                                                 string_ls=x,
                                                 group=lex_dict[key]['group'])
            else None for key in lex_dict.keys()
        ])
        list_lex_exp = list_lex_exp.apply(lambda x: list(filter(None, x)))

        df['has_intensifier'] = self.has_intensifier(list_lex_exp)
        df['avg_polarity'] = self.avg_polarity(list_lex_exp)
        return df
 def __init__(self, auteur, numero, langue = "fr"):
     """Crée l'objet Oeuvre s'il n'existe pas encore et le sauvegarde dans un fichier du même nom. S'il existe déjà, on le reprend simplement dans le fichier."""
     self.auteur = auteur
     self.numero = numero
     self.langue = langue
     self.categorie = None
     emplacement_textes = emplacement_dossier_groupe + "Corpus/" + dico_langues[langue] + "/Fichiers txt/"
     emplacement_oeuvres = emplacement_dossier_groupe + "Corpus/" + dico_langues[langue] + "/Fichiers oeuvres/"
     #self.infos = Infos(auteur,numero)
     print(auteur + str(numero), end = " ")
     try:
         with open(emplacement_oeuvres + auteur + str(numero), "rb") as mon_fichier:
             o = pickle.load(mon_fichier)
         self.texte_brut = o.texte_brut
         self.tags = o.tags
         self.mots = o.mots
         self.racines = o.racines
         self.POS = o.POS
         print("(importation terminee)", end = " / ")
     except FileNotFoundError:
         tagger = TreeTagger(TAGLANG = self.langue)
         self.texte_brut = formater(importer(auteur, numero,emplacement_textes))
         self.tags = make_tags(tagger.tag_text(self.texte_brut))
         self.mots = [t[0] for t in self.tags if len(t) == 3]
         self.racines = [t[2] for t in self.tags if len(t) == 3]
         self.POS = [t[1] for t in self.tags if len(t) == 3]
         with open(emplacement_oeuvres + "/" + auteur + str(numero), "wb") as mon_fichier:
             pickle.dump(self,mon_fichier,protocol = 2)
         print("(creation terminee)", end = " / ")
Example #7
 def tagText(self, text):
     cleared_text = preprocessTweetText(text)
     tags = self.tagger.tag_text(cleared_text,
                                 notagemail=True,
                                 notagdns=True)
     tags2 = treetaggerwrapper.make_tags(tags)
     return tags2
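preprocessTweetText is not shown in the source; a minimal hypothetical version (an assumption) strips @mentions and URLs before tagging:

import re

def preprocessTweetText(text):
    # Drop @mentions and URLs; TreeTagger's notagemail/notagdns flags
    # then handle the remaining e-mail and domain tokens.
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"https?://\S+", "", text)
    return text.strip()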
Example #8
def get_documents(docs, stopwords):
    """Extracts the documents from the corpus
    :param docs: [(source,datetime,text)]"""

    documents = list()
    corpus = [
        (doc[0], doc[1], doc[2]) for doc in docs if len(doc[2].split()) > 3
    ]  # drop short lines of fewer than 4 words

    tagger = tagr.TreeTagger(
        TAGLANG='fr',
        TAGDIR='c:/Applications/TreeTagger',
        TAGPARFILE='C:/Applications/TreeTagger/lib/french-utf8.par')

    idx, start_time = 1, time.time()
    for doc in corpus:
        source, datetime, raw = doc[0], doc[1], doc[2]
        tags = tagr.make_tags(tagger.tag_text(clean_text(raw)))
        tags = [tag for tag in tags if isinstance(tag, tagr.Tag)]
        # add all our elements to the array (documents)
        # each element in the array is a dictionary
        documents.append({
            'idx': idx,
            'source': source,
            'time': datetime,
            'raw': raw,
            'tags': tags
        })
        idx = progress_per(idx, len(corpus),
                           start_time)  # print the progress percentage info
    print()

    return documents
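clean_text is referenced but not defined here; a minimal hypothetical version (an assumption) collapses whitespace so the tagger sees one clean line:

import re

def clean_text(raw):
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r"\s+", " ", raw).strip()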
Example #9
def choose_next_step(tree, text, context=None):
    responses = tree['next']
    responses.append("start>stop")

    tags = make_tags(tagger.tag_text(text), exclude_nottags=True)
    proper_name = []
    noun = []
    verb = []
    other = []
    for index, (word, pos, lemma) in enumerate(tags):
        lemma = lemma.lower()
        if (pos == "NAM" or pos == "ADJ") and lemma not in proper_name:
            proper_name.append(lemma)
        elif pos == "NOM" and lemma not in noun:
            noun.append(lemma)
        elif pos.startswith("VER") and lemma not in verb:
            verb.append(lemma)
        else:
            other.append(lemma)

    list_tags = proper_name + noun + verb + other

    # check for other keywords
    for tag in list_tags:
        for response in responses:
            if type(response) == str:
                response = get_tree_by_tag(response)
            if tag in response['keywords']:
                if context:
                    response['context'] = context
                return response

    # cannot find an answer in the middle of an action => REPETE
    text_to_speech("Désolé, je n'ai pas compris")
    return REPETE
Example #10
def lemmatizza(frase):
    frasefinale = ""
    try:
        b = frase.split()
        frasef = []
        for parole in b:
            tags = tagger.tag_text(parole)
            pos = treetaggerwrapper.make_tags(tags)
            #pprint(pos)
            for w in pos:
                if parole == "vai":
                    frasef.append("andare")
                else:
                    a = w[2].lower()
                    if "|" not in a:
                        frasef.append(a)
                    else:
                        # Ambiguous lemma like "essere|stare": keep only the last alternative.
                        a = a.replace("|", "I")
                        a = re.sub(r'.*I', '', a)
                        frasef.append(a)

            frasefinale = " ".join(frasef)
    except Exception:
        pass  # on any tagging error, return whatever has been built so far

    return frasefinale
def text_to_tags(text):
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',TAGDIR='/home/yassine/EMSE 2015-2016/Projet Recherche/tree-tagger-linux-3.2')
    tags = treetaggerwrapper.make_tags(tagger.tag_text(unicode(text,encoding='utf-8')))
    pos_tags = []
    for pos in tags:
        pos_tags.append(pos[1])
    return " ".join(pos_tags)
Example #12
def _extract_tags(sentence):
    ''' Internal method that extracts the tags (word, grammatical class, lemma) from the
    treetaggerwrapper output
    '''
    tagged_text = _TAGGER.tag_text(sentence)
    tags = treetaggerwrapper.make_tags(tagged_text)
    return tags
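A usage sketch, assuming _TAGGER was created elsewhere as treetaggerwrapper.TreeTagger(TAGLANG='fr'):

for tag in _extract_tags("Les chats dorment."):
    # Regular tokens come back as Tag(word, pos, lemma) namedtuples.
    print(tag)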
Example #13
 def lemmatize_word(self, doc):
     tags = self.lemmatizer.tag_text(doc)
     tags2 = treetaggerwrapper.make_tags(tags)
     if tags2:
         return tags2[0].lemma
     else:
         return ""
Example #14
    def tag(self, text):
        """POS tag tokenized text."""
        if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN:
            tokens = self.__tokenizer.tokenize(text)
            return self.__tagger.tag(tokens)
        elif self.tagger_name == POSTagger.TT:
            tags = self.__tagger.tag_text(text)
            tuple_list = []
            tag_list = treetaggerwrapper.make_tags(tags)
            for item in tag_list:
                tuple_list.append((item[0], item[1]))
            return tuple_list
        elif self.tagger_name == POSTagger.SPACY:
            tags = self.__tagger(text)
            tuple_list = []
            for word in tags:
                tuple_list.append((word.orth_, word.tag_))
            return tuple_list
        else:
            pass

#tagger = POSTagger("spacy-tagger")
#doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
#print(tagger.tag("Ich werde morgen in die Schule gehen."))
#print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
Example #15
def process_file(out_file_name):
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='pl')
    f = open(out_file_name + '.txt', 'r')
    w = open(out_file_name + '_lemmatized.txt', 'w')
    i = 0
    wrong_pos = ['SENT', 'interp']  # POS tags to drop (sentence ends, punctuation)

    for line in f:
        try:
            with timeout(5, exception=RuntimeError):
                tags = tagger.tag_text(line)
                tag_list = []

                tags2 = treetaggerwrapper.make_tags(tags)

                for tag in tags2:
                    if tag.pos not in wrong_pos:
                        tag_list.append(tag.lemma)

                w.write(' '.join(tag_list) + '\n')

                i += 1
                if i % 100 == 0:  # progress report every 100 lines
                    print(i)

        except RuntimeError:
            continue

    f.close()
    w.close()
def lemma(inpath, outpath, charFilter):
    ''' Lemmatizes the texts in the given folder inpath. '''
    for text in os.listdir(inpath):
        if text.endswith('.txt'):
            f_lemma = []
            result = ''
            with open(inpath + '/' + text, 'r') as infile:
                f = infile.read()
            tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
            tags = tagger.tag_text(f)
            tags2 = treetaggerwrapper.make_tags(tags)
            print("text", text)
            for tag in tags2:
                try:
                    result += tag.lemma
                    result += ' '
                except AttributeError:
                    pass  # skip NotTag entries without a lemma
            f_lemma.append(result)
            out_name = outpath + text.split('_')[1] + '_' + text.split('_')[2] + '.txt'
            with open(out_name, 'w') as txtFile:
                for i in f_lemma:
                    txtFile.write(replace(i, charFilter + ' '))
    return
Example #17
def build_tree_tagger(text, source_file, output_path):
    global dir_tree_tagger
    # build a TreeTagger wrapper
    tagger = treetaggerwrapper.TreeTagger(TAGDIR=dir_tree_tagger, TAGLANG="fr")
    # tag the text
    tags = tagger.tag_text(text)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    tagger.tag_file_to(str(source_file), str(output_path / 'tagger_result.txt'))
    tags2 = treetaggerwrapper.make_tags(tags)
    tag_dict = dict()
    for tag in tags2:
        if hasattr(tag, 'pos'):
            # normalize to ASCII so accented forms share a key
            key = unicodedata.normalize('NFD', tag.word).encode('ascii', 'ignore')
            tag_dict[key] = {"pos": tag.pos, "lemma": tag.lemma}
    return tag_dict, tags2
Example #18
def getwords(doc):
    tags = [
        x.lemma for x in treetaggerwrapper.make_tags(tagger.tag_text(doc),
                                                     exclude_nottags=True)
        if not re.search("[0-9a-f]{10,}|[^0-9A-Za-z]", x.lemma)
    ]
    return tags[0] if len(tags) > 0 else None
Example #19
def part_of_speech(words):
    tags = []
    t_tags = ttw.make_tags(tagger.tag_text(unicode(' '.join(words))),
                           exclude_nottags=True)
    for tag in t_tags:
        tags.append((tag.lemma, tag.pos))
    return {tag: True for tag in tags}
Example #20
def annotate_with_pos(articles, tagger):
    all_pos = []

    for art in articles:
        tokenized_art = []

        # Tokenize by word
        for sent in art.split('\n'):
            tokenized_art += word_tokenize(sent)
        tg_input = "\n".join([t for t in tokenized_art])

        # Apply tagger
        tg_output = tagger.tag_text(tg_input)

        # Receive tags for each word, exclude URLs and similar
        anno = treetaggerwrapper.make_tags(tg_output, exclude_nottags=True)

        pos_dict = defaultdict(int)

        # Count the POS tag occurrences.
        # NOTE: Saving the full list of tags caused memory errors.
        for a in anno:
            pos_dict[a[1]] += 1

        all_pos.append(pos_dict)

    vec = DictVectorizer()
    pos_vectorized = vec.fit_transform(all_pos)

    # Convert the dictionary to feature matrix
    return pos_vectorized.toarray()
def get_level(file_name):
    import re
    import word_level

    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=os.getcwd())
    tags = tagger.tag_file(file_name + '.txt')
    tags2 = ttw.make_tags(tags)

    # with open(file_name + '.tag', 'w') as f:
    #     for tag in tags:
    #         f.write("%s\n" % tag)

    words = []

    for tag in tags2:
        if re.search(r'^\w', tag.lemma):
            for word in tag.lemma.lower().split('-'):
                words.append(word)

    words = list(set(words))

    # counts per difficulty level; index 0 collects unknown words
    sentence_level = [0] * 13
    unknown_words = []

    for word in words:
        level = word_level.get_level(word)
        sentence_level[level] += 1
        if level == 0:
            unknown_words.append(word)

    print(sentence_level)
    print(unknown_words)

    return sentence_level
Example #23
def taggerTexte(texte):
    """
    Normalizes the text and returns the lemmas.

    Arguments:
        text

    Returns:
        list of the relevant lemmas
    """
    # Normalize typographic apostrophes
    tex = "'".join(texte.split("’"))

    # Detect the language; default to French if neither is detected
    langdet = 'fr'
    if detect(tex) == 'en':
        langdet = 'en'
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=langdet)
    tags = tagger.tag_text(tex)
    tags2 = treetaggerwrapper.make_tags(tags)

    # Keep only the POS and lemma columns, one tab-separated line per tag
    grammar = []
    for element in tags2:
        compt = 0
        for i in element:
            if compt == 1 or compt == 2:
                grammar.append(i)
                grammar.append("\t")
            compt += 1
        grammar.append("\n")
    if grammar:
        del grammar[-1]
    res = "".join(grammar)

    # Drop empty lines and e-mail-like tokens
    ress = [rrr for rrr in res.split("\n") if len(rrr) > 0 and "@" not in rrr]

    # French (VER, NOM, ABR, ADJ) and English (VV, NN, NP, JJ, VH, VB, MD)
    # POS prefixes are the ones considered relevant
    keep = ("VER", "NOM", "ABR", "ADJ",
            "VV", "NN", "NP", "JJ", "VH", "VB", "MD")
    lemmes = []
    for rr in ress:
        match_tag = re.search(r"(.*)\t(.*)\t", rr)
        if match_tag is None:
            continue
        if any(k in match_tag.group(1) for k in keep):
            lemmes.append(match_tag.group(2).lower())
    return lemmes
Example #24
def tokenize_and_lemmatize_tweets(listTweets):
    """Tokenize & lemmatize a list of texts"""
    global french_stop_words
    global mention_regex
    global LOCALTAGDIR

    # Setting up TreeTagger
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr', TAGDIR=LOCALTAGDIR)

    for t in listTweets:
        text = mention_regex.sub("", t["text"]).lower()
        tags = tagger.tag_text(text)
        tags = treetaggerwrapper.make_tags(tags)
        tokens = []
        lemma = []
        # Filtering
        for tag in tags:
            if hasattr(tag, 'word'):
                if not (len(tag.lemma) < 2 or tag.lemma in french_stop_words):
                    tokens.append(tag.word)
                    lemma.append(tag.lemma)
            else:
                token = tag.what
                if not (len(token) < 2 or token in french_stop_words):
                    if token.startswith("<repurl") or token.startswith(
                            "<repdns"):
                        token = token[token.find('"') + 1:token.rfind('"')]
                    else:
                        lemma.append(token)
                    tokens.append(token)

        t["tokenArray"] = tokens
        t["lemmaArray"] = lemma

    return listTweets
Example #25
def split_words(path, doc_id=''):
    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=settings.TTBin)

    tags = tagger.tag_file(path)
    tags = ttw.make_tags(tags)

    return TaggedDocument(
        tags=[doc_id], words=[tag.lemma for tag in tags if tag.pos in NN_list])
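NN_list is not defined in the source. For the English TreeTagger tagset a plausible choice (an assumption) is the noun tags:

# Common and proper nouns in the English TreeTagger (Penn-style) tagset.
NN_list = ['NN', 'NNS', 'NP', 'NPS']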
Example #26
    def compute(self):
        """ Compute the feature value for attribute Bag of Pos Tag

        First check whether there is a seen BoP skeleton or not. If
        there isn't build_model().Walking through text_set and compute
        feature value for every text object. Counting every Pos Tag appearance
        from the text_set.

        Storing feature value in text.feature hash.
        """

        if self.bow_model is not None:
            print "BOW not None"

            for text in self._text_set:
                temp_model = dict(self.bow_model)
                tags = treetaggerwrapper.make_tags(
                    self.tagger.tag_text(text.text.decode("utf-8")))
                for tag in tags:
                    try:
                        temp_model[tag[1]] += 1
                    except KeyError:
                        continue
                text.features["bag_of_pos"] = temp_model.values()

        else:
            print "BOW is None"
            self.build_model()

            for text in self._text_set:
                # for test_case ''' test__bag_of_words__compute ''' use the OrderedDict
                # to check the values with the term_frequency in test_suitcase.resource
                #
                # temp_model = collections.OrderedDict(sorted(self.model.items()))

                temp_model = dict(self.model)
                tags = treetaggerwrapper.make_tags(
                    self.tagger.tag_text(text.text.decode("utf-8")))
                for tag in tags:
                    try:
                        temp_model[tag[1]] += 1
                    except KeyError:
                        continue
                text.features["bag_of_pos"] = temp_model.values()

            self.bow_model = self.model
Example #27
def lemmatize_input_files():
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    files = [f for f in glob.glob("../texts/txt/*.txt")]
    return {
        os.path.basename(f): treetaggerwrapper.make_tags(tagger.tag_file(f),
                                                         exclude_nottags=True)
        for f in files
    }
def french_labeling(New, lang):

    TAGS = GetTags.get_tags(lang)

    A = [1 / (n) for n in range(1, len(TAGS) + 1)]
    A.sort()
    # Ranking the tags
    d = {}
    for i in range(len(A)):
        d[TAGS[i]] = A[len(A) - 1 - i]

    # Generate positive and negative lexicons for word labeling
    Positive, Negative = Lexicons.lexicons(lang)
    # Extracting sentences
    L = []
    if lang == 'English':
        lang = 'en'
    elif lang == 'French':
        lang = 'fr'
    # 1) Build a TreeTagger wrapper according to the language:
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang)
    for i in range(len(New)):
        s = New['description'][i]
        s = re.sub('#', '', s)
        # 2) Tag the text
        tags = tagger.tag_text(s)
        tags2 = treetaggerwrapper.make_tags(tags)
        pos_s = 1
        neg_s = 1
        for j in range(len(tags2)):
            if tags2[j][0] in Positive or tags2[j][2] in Positive:
                if tags2[j - 1][0] in ['jamais', 'pas', 'ne', 'non']:
                    pos_s = pos_s - (pos_s * d[tags2[j][1]])
                else:
                    pos_s = pos_s + (pos_s * d[tags2[j][1]])
            else:
                if tags2[j][0] in Negative or tags2[j][2] in Negative:
                    if tags2[j - 1][0] in ['jamais', 'pas', 'ne', 'non']:
                        neg_s = neg_s - (neg_s * d[tags2[j][1]])
                    else:
                        neg_s = neg_s + (neg_s * d[tags2[j][1]])
        # Map the scores to labels once every token has been scored
        if pos_s / len(s) == neg_s / len(s) or abs(pos_s - neg_s) < 0.05:
            polarity = 'Neutral'
        elif neg_s / len(s) > pos_s / len(s):
            polarity = 'Negative'
        else:
            polarity = 'Positive'

        L.append(s + '\t' + polarity)
    LL = [a.split('\t')[1] for a in L]

    return LL
 def test_apply_gives_correct_relative_values_for_skip_grams(self):
     tags = treetaggerwrapper.make_tags(
         self.tagger.tag_text(
             'ceci est un texte très court à taguer, et un mot apparaît deux fois'
         ))
     result = SkipGramFeature().apply(tags)
     self.assertEqual({
         ('DET:ART', 'NOM'): 0.08,
         ('VER:pres', 'NOM'): 0.08
     }, result)
def comment_to_lemme(comment):
    t = treetaggerwrapper.make_tags(tagger.tag_text(comment))
    lemme = ''
    for i in t:
        if (isinstance(i, treetaggerwrapper.Tag)
                and i.pos[:3] in ('ADJ', 'ADV', 'INT', 'KON', 'NOM', 'VER')
                and i.lemma != 'dns-remplacé'
                and len(i.lemma) > 1):
            lemme = lemme + ' ' + i.lemma.split('|')[0].lower()
    return lemme
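A usage sketch, assuming a module-level French tagger (an assumption, since the source does not show it):

import treetaggerwrapper

tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
# Keeps only adjective/adverb/interjection/conjunction/noun/verb lemmas.
print(comment_to_lemme("Les livraisons étaient très rapides !"))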
Example #32
def tag_text(text):
    if language == 'german':
        country_code = 'de'
    else:
        country_code = 'en'
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=country_code, TAGDIR=tree_tagger_dir)
    text_with_tags = tagger.tag_text(text)

    tags = treetaggerwrapper.make_tags(text_with_tags)
    return tags
Example #33
def sentences_to_ngrams(sentences, ngram_size, fr_nouns_file):

    ngrams = []
    context_size = int(ngram_size / 2)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr',
                                          TAGINENC='utf-8',
                                          TAGOUTENC='utf-8')

    with open(fr_nouns_file, "r") as file:
        fr_nouns = file.readlines()

    for s in sentences:
        s = s.replace(';', '')
        # Protect the elided articles d'/l' with placeholder tokens so they
        # survive tokenization as separate words.
        s = s.replace("d'", " deeee ")
        s = s.replace("l'", " leeee ")

        sentence_tagged = treetaggerwrapper.make_tags(tagger.tag_text(s))
        sentence = list(np.array(sentence_tagged)
                        [:, 0])  # getting only the token (not lemmas and POS)

        for i, token in enumerate(sentence):
            if token == "leeee":
                sentence[i] = "l\'"
            if token == 'deeee':
                sentence[i] = "d\'"

        index_left = sentence.index('[')
        index_right = sentence.index(']')

        phrase_ngram = []

        # add left context
        for i in range(context_size):
            try:
                phrase_ngram.append(sentence[index_left - context_size + i])
            except IndexError:
                # when there are not enough words (e.g. the pivot word starts the sentence)
                phrase_ngram.append(random.choice(fr_nouns).rstrip())

        # add pivot token(s) (can contain several tokens)
        phrase_ngram.append(' '.join(sentence[index_left + 1:index_right]))

        # add right context
        for i in range(context_size):
            try:
                phrase_ngram.append(sentence[index_right + 1 + i])
            except IndexError:
                # when there are not enough words (e.g. the pivot word starts the sentence)
                phrase_ngram.append(random.choice(fr_nouns).rstrip())

        ngrams.append(phrase_ngram)

    return ngrams
Example #34
def extract_tags(text):
    ret = list()
    tags = tagger.tag_text(text)
    tags = make_tags(tags)
    for tag in tags:
        tmp = dict()
        tmp['word'] = tag.word
        tmp['lemma'] = tag.lemma.split('|')[0]
        tmp['pos'] = translate_pos(tag.pos.split(':')[0])
        ret.append(tmp)
    return ret
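translate_pos is not shown in the source; a hypothetical minimal mapping from French TreeTagger POS prefixes to coarse labels might look like this:

def translate_pos(pos):
    # Map e.g. 'VER' (from 'VER:pres') to a coarse category.
    mapping = {'NOM': 'noun', 'NAM': 'name', 'VER': 'verb',
               'ADJ': 'adj', 'ADV': 'adv'}
    return mapping.get(pos, 'other')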
Example #35
    def compute(self):
        """ Compute the feature value for attribute Adjective

        Walking through text_set and compute feature value for every
        text object.

        Storing faeture value in text.feature hash.
        """
        for text in self._text_set:
            tag_list = treetaggerwrapper.make_tags(self.tagger.tag_text(text.text.decode("utf-8")))
            text.features["adjective"] = self.count_adj(tag_list)
Example #36
 def lemmatise(self, text):
     """
     lemmatise a text
     """
     tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
     tags = tagger.tag_text(text)
     mytags = treetaggerwrapper.make_tags(tags)
     lemma_list=[]
     for tag in mytags:
         lemma_list.append(tag.lemma)
     return ' '.join(lemma_list)
Example #37
    def build_model(self):
        """ Building a Bag of Pos Tag Skeleton.

        A Bag of Pos Tag Skeleton is a hash containing every unique
        Pos Tag, that is in a text from the text_set, as key.
        Initial value is an integer(0).
        """
        for text in self._text_set:
            tags = treetaggerwrapper.make_tags(self.tagger.tag_text(text.text.decode("utf-8")))
            for tag in tags:
                self.model[tag[1]] = 0
Example #38
 def treetag_paragraphs(self, paragraphs, tagger):
     try:
         tt_tags = [
             treetaggerwrapper.make_tags(tagger.tag_text(para.lower()),
                                         exclude_nottags=True)
             for para in paragraphs
         ]
     except Exception:
         print(f'Treetagging error on id: {self.id}')
         tt_tags = []
     return tt_tags
Example #39
    def compute(self):
        """ Compute the quantity of ModalVerbs in every text from the text_set

        Walking through text_set and compute feature value for every text object.

        Storing feature value in text.feature hash.
        """
        for text in self._text_set:
            count_modal = 0
            tags = treetaggerwrapper.make_tags(self.tagger.tag_text(text.text.decode("utf-8")))
            for tag in tags:
                if tag[1] in self.tag_list:
                    count_modal += 1
            text.features[self._name] = count_modal
Example #40
	def applyTreeTagger(self, text):
		"""
		Computes the TreeTagger result for the given text if not already done.

		@parameters
		text		string	text that will be tagged

		@returns	list	TreeTagger result
		"""
		if self.treetagged == "":
			tagger = tt.TreeTagger(TAGLANG="de")
			tagged_list = tt.make_tags(tagger.tag_text(self.cleanSource(text)))
			return tagged_list
		else:
			return self.treetagged
Example #41
    def tag_one(self, text, skip_unknown=True, **kwargs):
        """ POS-Tags the given text, optionally skipping unknown lemmas

            :param unicode text: Text to be tagged
            :param bool skip_unknown: Automatically remove unrecognized tags from the result

            Sample usage:

            >>> from strephit.commons.pos_tag import TTPosTagger
            >>> from pprint import pprint
            >>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
            [Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
             Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
             Tag(word=u'to', pos=u'TO', lemma=u'to'),
             Tag(word=u'be', pos=u'VB', lemma=u'be'),
             Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
        """
        return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
                                      skip_unknown)
Example #42
    def compute(self):
        """ Compute the feature value for attribute SentenceStart

        Building lemmatas and tags with TreeTagger. Walking through
        text_set and compute feature value for every text object.

        Storing faeture value in text.feature hash.
        """

        for text in self._text_set:
            for sent in text.sentencelist:
                tags = treetaggerwrapper.make_tags(self.tagger.tag_text(sent))[0:2]
                try:
                    self.tuple_list_lemma.append((tags[0][2], tags[1][2]))
                    self.tuple_list_tag.append((tags[0][1], tags[1][1]))
                except IndexError:
                    continue
            text.features["sentence_start"] = [self.count_lemma(), self.count_tag()]

            self.tuple_list_tag = []
            self.tuple_list_lemma = []
Example #43
def treetag(sentence, encoding = None): # TreeTagger helper function.
    if encoding is not None:
        return treetaggerwrapper.make_tags(tagger.tag_text(unicode(sentence, "utf-8")))
    else:
        return treetaggerwrapper.make_tags(tagger.tag_text(sentence))
Example #44
 def lemmatize_chunk(self, doc):
     tags = self.lemmatizer.tag_text(doc)
     tags2 = treetaggerwrapper.make_tags(tags)
     return [item.lemma for item in tags2]
Example #45
 def get_tree_tagged_tokens(self):
     """takes the tokens and tags them"""
     tagger = self.tagger
     return treetaggerwrapper.make_tags(tagger.tag_text(self.tokens))
Example #46
 def _finalize_batch(self, jobs, pos_tag_key):
     for item, job in jobs:
         job.wait_finished()
         item[pos_tag_key] = self._postprocess_tags(make_tags(job.result))
         yield item
Example #47
 def stemmed_token_count(self, token):
     stem = treetaggerwrapper.make_tags(self.tagger.tag_text(token))[0].lemma
     return FreqDist(self.stems)[stem]
	message = re.sub(r'\n','',message)
	# delete vote comments
	for comments in voteComments:
		for comment in comments:
			message = re.sub(comment,'',message)
			message = message.replace(comment, '')

	comment = r'\([1-9]* inline comment(s)*\)'
	message = re.sub(comment,'',message)

	if message == '':
		continue

	# tag the message
	tagText = tagger.tag_text(message.decode('utf-8'))
	tags = treetaggerwrapper.make_tags(tagText, exclude_nottags=True)

	#print words
	fv = {}					 # count of each word
	words = 0				   # total count of words in the line

	for word in list(fv_df.keys()):
		count_flag[word] = False

	for tag in tags:
		words += 1
		word = tag.lemma

		if fv.has_key(word):
			fv[word]+=1
		else:
Example #49
count_flag = {}                 # dictionary of flags needed when computing fv_df

# Morphological analysis of each document and counting of word occurrences
for txt_id, txt in enumerate(text):
    # Initialization for using MeCab
    #tagger = MeCab.Tagger("-d /var/lib/mecab/dic/ipadic-utf8")
    #tagger.parse('')
    #node = tagger.parseToNode(txt)

    # Initialization for using TreeTagger

    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',TAGDIR='../treetagger')
    tagText = tagger.tag_text(txt)

    node = treetaggerwrapper.make_tags(tagText, exclude_nottags=False)



    fv = {}                     # dictionary of word occurrence counts
    words = 0                   # total number of word occurrences in this document

    for word in list(fv_df.keys()):
        count_flag[word] = False

    # make_tags() returns a list of Tag tuples, so iterate over it directly
    for tag in node:
        surface = tag.word # the token produced by the tagger

        words += 1
Example #50
	sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/spanish.pickle'))

stop_words = nltk.corpus.stopwords.words('spanish') 
non_alphabetic = re.compile(r"\W|\d")
words = []
tags = []

# Using TreeTagger:
# 1) pip install treetaggerwrapper
# 2) put TreeTagger in %PYHOME%\Lib\site-packages\TreeTagger
# 3) put spanish-utf8.par and spanish-chunker.par in \TreeTagger\lib
# See http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish-tagset.txt for tag meanings
tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')
for sentence in article_corpus.sents():
	tagged_sentence = tagger.tag_text(sentence) 
	tags.extend(treetaggerwrapper.make_tags(tagged_sentence))

#TODO: create a tagger script, save the tagged files
#TODO: look at alternate taggers, compare

#TODO: profile this and see which part is taking so long
for tag in tags:
	lemma = tag[2].lower()
	if lemma not in stop_words and not non_alphabetic.search(lemma):
		words.append(lemma)

freq_dist = FreqDist(words)

with open('./frequency_distribution.txt', 'w', encoding='utf-8') as f:
	f.write("word, number of occurences\n")
	for word in freq_dist.most_common():
Example #51
def tag_pos(text, language):
  ''' Tag parts-of-speech in text; return tagged text '''
  # ttw will throw an error if the code isn't supported
  tagger = ttw.TreeTagger(TAGLANG=lang_codes[language])
  tags = tagger.tag_text(text)
  return ttw.make_tags(tags)
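lang_codes is assumed to map language names to the two-letter codes treetaggerwrapper expects; a hypothetical example:

# Extend with any language TreeTagger has a .par parameter file for.
lang_codes = {'english': 'en', 'spanish': 'es', 'french': 'fr', 'german': 'de'}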