Example #1
def main(argv):
    parser = get_parser()
    preprocessor = PreprocessingClass()
    tagger_en = treetaggerwrapper.TreeTagger(TAGLANG='en', TAGDIR="C:\\TreeTagger")
    tagger_es = treetaggerwrapper.TreeTagger(TAGLANG='es', TAGDIR="C:\\TreeTagger")
    tagger_nl = treetaggerwrapper.TreeTagger(TAGLANG='nl', TAGDIR="C:\\TreeTagger")

    (options, args) = parser.parse_args(argv)

    if not (options.input and options.output):
        parser.error("Required arguments not provided")
    else:
        lang = detect_language(options.input)
        if lang.lower() not in ['en', 'es', 'nl']:
            print('Language other than en, es, nl', file=sys.stderr)
            sys.exit(1)
        else:
            print()
            print("Current Language:", lang)

            # final_model_path = options.model

            final_model_path = change_path_to_windows_style(options.model)
            final_output_path = change_path_to_windows_style(options.output)

            if lang.lower() == "en":
                main_classifier = FinalClassicationClass(lang.lower(), options.input, final_output_path, None, 2)

                dataset_input = preprocessor.read_all_files(options.input, "Test Set", lang.lower())
                X, y_author = preprocessor.split_lists_dev(dataset_input)
                X_list_pos_tags, X_list_lemma = preprocessor.stem_and_pos(X, tagger_en)
                pipelined_dictionary = preprocessor.create_pipeline_dict(X, X_list_lemma, X_list_pos_tags, y_author)
                main_classifier.dataset_statistics_dev(X, y_author)
                # load models
                gender_model, age_model = load_model(final_model_path, lang.lower())
                main_classifier.test_model(gender_model, pipelined_dictionary, lang, y_author, final_output_path, age_model)

            elif lang.lower() == "nl":
                main_classifier = FinalClassicationClass(lang.lower(), options.input, final_output_path, None, 2)

                dataset_input = preprocessor.read_all_files(options.input, "Test Set", lang.lower())
                X, y_author = preprocessor.split_lists_dev(dataset_input)
                X_list_pos_tags, X_list_lemma = preprocessor.stem_and_pos(X, tagger_nl)
                pipelined_dictionary = preprocessor.create_pipeline_dict(X, X_list_lemma, X_list_pos_tags, y_author)
                main_classifier.dataset_statistics_dev(X, y_author)
                # load models
                gender_model = load_model(final_model_path, lang.lower())
                main_classifier.test_model(gender_model, pipelined_dictionary, lang, y_author, final_output_path)

            elif lang.lower() == "es":
                main_classifier = FinalClassicationClass(lang.lower(), options.input, final_output_path, None, 2)

                dataset_input = preprocessor.read_all_files(options.input, "Test Set", lang.lower())
                X, y_author = preprocessor.split_lists_dev(dataset_input)
                X_list_pos_tags, X_list_lemma = preprocessor.stem_and_pos(X, tagger_es)
                pipelined_dictionary = preprocessor.create_pipeline_dict(X, X_list_lemma, X_list_pos_tags, y_author)
                main_classifier.dataset_statistics_dev(X, y_author)
                # load models
                gender_model, age_model = load_model(final_model_path, lang.lower())
                main_classifier.test_model(gender_model, pipelined_dictionary, lang, y_author, final_output_path, age_model)
Example #2
 def __init__(self, index_path="/INDEX"):
     self.index = {}
     self.index_document = {}
     self.plain_word_fr = re.compile("ABR|ADJ|NAM|NOM|VER")
     self.plain_word_en = re.compile("JJ|NP|NN|VB")
     self.fr_tagger = treetaggerwrapper.TreeTagger(TAGLANG="fr")
     self.en_tagger = treetaggerwrapper.TreeTagger(TAGLANG="en")
     self.save_folder = index_path
     self.keep_path = self.save_folder + "/documentsIndex"
     self.index_name = self.save_folder + "/index.json"
     self.index_document_name = self.save_folder + "/index_document.json"
Example #3

def tag_poslemma(dirName):
    
    # TAGGED-ALL
    writer=csv.writer(open('dictionaries/taggedAll_' + dirName + '.csv', 'w'))
    
    # OPEN FILES IN DIRECTORY
    for witness in os.listdir('data/' + dirName):
        if witness.endswith(".txt"):
            with open('data/' + dirName + '/' + witness) as wit:
                witText = wit.read()

                # TAG USING FRO
                taggerFro = treetaggerwrapper.TreeTagger(TAGLANG='froBfm')
                tagsFro = taggerFro.tag_text(witText)  # LIST WITH TAGGED WORDS (FRO)

                # TAG USING STEIN
                taggerStein = treetaggerwrapper.TreeTagger(TAGLANG='stein')
                tagsSteinDirt = taggerStein.tag_text(witText)  # "dirty", because it has too much info and symbols
                tagsSteinStr = '\n'.join(tagsSteinDirt)  # list to string  
                # CLEAN OUTPUT STEIN
                patterns = [('_.*', ''),
                            ('\d.*', ''),
                          # ('\|.*', ''), left commented out so that the different output possibilities are preserved
                            ('�', 'ö'),  # encoding problem; it does not seem to come from the TreeTaggerWrapper or this script (the lexicon, maybe?). Not a real solution, but it works
                            ('<nolem>', 'UNKNOWN')]
                for (p1,p2) in patterns:
                    p = re.compile(p1)
                    tagsSteinStr = p.sub(p2, tagsSteinStr)
                tagsStein = tagsSteinStr.split('\n')  # LIST WITH TAGGED WORDS (STEIN)
                
                for itemFro, itemStein in zip(tagsFro, tagsStein):
                    token = itemFro.split('\t')[0]
                    pos = itemFro.split('\t')[1]
                    lemma = itemStein.split('\t')[2]
                    item = token + '\t' + pos + '_' + lemma
                    writer.writerow([item]) # populate the file with items (made by token, pos and lemma)
                    

    # TAGGED-DISTINCT
    reader=csv.reader(open('dictionaries/taggedAll_' + dirName + '.csv', 'r'), delimiter='\t')
    writer=csv.writer(open('dictionaries/taggedDistinct_' + dirName + '.csv', 'w'), delimiter=',')
    entries = set()
    writer.writerow(['Original', 'Normalised'])
    for row in reader:
        key = (row[0], row[1])
        if key not in entries:
            writer.writerow(row)
            entries.add(key)
Example #4

def tokenize_and_lemmatize_tweets(listTweets):
    """Tokenize & lemmatize a list of texts"""
    global french_stop_words
    global mention_regex
    global LOCALTAGDIR

    # Setting up TreeTagger
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr', TAGDIR=LOCALTAGDIR)

    for t in listTweets:
        text = mention_regex.sub("", t["text"]).lower()
        tags = tagger.tag_text(text)
        tags = treetaggerwrapper.make_tags(tags)
        tokens = []
        lemma = []
        # Filtering
        for tag in tags:
            if hasattr(tag, 'word'):
                if not (len(tag.lemma) < 2 or tag.lemma in french_stop_words):
                    tokens.append(tag.word)
                    lemma.append(tag.lemma)
            else:
                token = tag.what
                if not (len(token) < 2 or token in french_stop_words):
                    if token.startswith("<repurl") or token.startswith(
                            "<repdns"):
                        token = token[token.find('"') + 1:token.rfind('"')]
                    else:
                        lemma.append(token)
                    tokens.append(token)

        t["tokenArray"] = tokens
        t["lemmaArray"] = lemma

    return listTweets
Example #5
def postag_directory(input_directory, output_directory):
    """
    This function POS-tag a directory full of documents (opinions, tweets, comments)


    Args:
        input_directory: The

        output_directory:

    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    # Loop over the files
    all_tags = []
    for filename in sorted(glob.glob(os.path.join(input_directory, '*.txt'))):
        with codecs.open(filename, encoding='utf-8') as f:
            # Read the file
            content = f.read()
            # Tag it
            tags = tagger.tag_text(content)
            # add those tags to the master tag list
            all_tags.append(tags)

    for i, a_list in enumerate(all_tags):
        new_dir_path = output_directory
        path = os.path.join(new_dir_path, "list%d.txt" % i)
        with open(path, "w") as f:
            for item in a_list:
                f.write(item + "\n")
Example #6
def postag_pandas(input_file, output_file):
    def postag_string(s):
        '''Returns tagged text from string s'''
        if isinstance(s, bytes):
            s = s.decode('utf-8')
        return tagger.tag_text(s)

    # Reading in the file
    all_lines = []
    with open(input_file) as f:
        for line in f:
            all_lines.append(line.strip().split('|', 1))

    df = pd.DataFrame(all_lines[1:], columns=all_lines[0])

    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')

    df['POS-tagged_content'] = df['content'].apply(postag_string)

    # Format fix:
    def fix_format(x):
        '''x - a list or an array'''
        out = [tuple(i.split('\t')) for i in x]
        return out

    df['POS-tagged_content'] = df['POS-tagged_content'].apply(fix_format)
    df['content'] = df['content'].map(lambda x: x.lstrip('"""' ''))

    print(list(df.columns.values))
    return df.to_csv(output_file, sep='|', index=False)
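
Note: postag_pandas expects a '|'-separated input file whose header row contains a 'content' column. A minimal usage sketch, with a hypothetical file name and contents:

# Hypothetical input file; postag_pandas writes a '|'-separated CSV to output_file.
with open('input.txt', 'w') as f:
    f.write('id|content\n')
    f.write('1|The cat sat on the mat.\n')
postag_pandas('input.txt', 'output.csv')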
Example #7

	def __init__(self, input, output):  # , tagdir='/opt/treetagger'):
		self.tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')#, TAGDIR=tagdir)
		self.inp = input
		self.outp = output
		with open(self.inp + 'news_stops') as f:
			self.stopwords = f.read().strip().split()	
		self.excltags = ['CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'RP', 'SYM', 'TO', 'WDT', 'WP', 'WP$', 'WRB', 'VB', 'VBD', 'VBG', 'VBN', 'VBZ', 'VBP', 'VD', 'VDD', 'VDG', 'VDN', 'VDZ', 'VDP', 'VH', 'VHD', 'VHG', 'VHN', 'VHZ', 'VHP', 'RB', 'RBR', 'RBS', 'JJ', 'JJR', 'JJS']
Example #8

def Tagger(dataset):
    # ------------Open the files--------------------------------------------#
    D = open(dataset, 'r')
    Tag = open("DataTAG.txt", 'w')

    # ------------List of tags to take into account-------------------------#
    ListeTags = [
        "JJ", "JJR", "JJS", "VV", "VVD", "VVG", "VVN", "RBR", "RBS", "UH", "RB"
    ]

    # ------------Wrapper configuration--------------------------------------#
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',
                                          TAGDIR='./TreeTagger',
                                          TAGINENC='utf-8',
                                          TAGOUTENC='utf-8')

    # ------------Analysis:---------------------------------------------------#
    # For each line of the dataset, TreeTagger analyses the comment
    # and writes the important words to the DataTAG file

    for line in D:
        line = re.sub('http:\/\/[0-9a-zA-Z-_\.]*(\.[a-z]{0,9}\/?)?',
                      "website ", line)  # remove URLs
        line = re.sub(
            "[^a-zA-Z0-9 '-]", " ", line
        )  # remove all special characters (except ', - and space)
        tags = tagger.TagText(line)  # tag each word of the comment
        for words in tags:  # keep the words whose tags are in ListeTags
            w = words.split("\t")

            if w[1] in ListeTags:
                Tag.write(w[0] + " ")
        Tag.write("\n")
Example #9
 def _set_treetagger(self, language):
     import treetaggerwrapper as ttw
     try:
         self._tagger = ttw.TreeTagger(TAGLANG=language)
         self.morphy = self._treetagger_morphy
     except ttw.TreeTaggerError:
         raise ImportError
Example #10

	def _useTagger(self, langID: str):
		assert isinstance(langID, str)

		with self.__mainLock:
			langIDCache = self.__unused.get(langID, None)
			if langIDCache is None:
				langIDCache = PoolOfThreadedTreeTaggers._LangSpecificCache(langID)
				self.__unused[langID] = langIDCache

		langIDCache.touch()

		if langIDCache.idleInstances:
			with langIDCache.langLock:
				tagger = langIDCache.idleInstances[-1]
				del langIDCache.idleInstances[-1]
				langIDCache.countUsedInstances += 1
		else:
			tagger = ttpw.TreeTagger(
				TAGLANG=langID,
				TAGOPT="-prob -threshold 0.7 -token -lemma -sgml -quiet",
				TAGDIR=self.__treeTaggerInstallationPath)
			self.__onTaggerCreated.fire(self, langID)
			with langIDCache.langLock:
				langIDCache.countUsedInstances += 1

		try:
			yield tagger
		finally:
			with langIDCache.langLock:
				langIDCache.countUsedInstances -= 1
				langIDCache.idleInstances.append(tagger)
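
Note: _useTagger yields a borrowed tagger and hands it back to the pool in the finally block, so it is presumably decorated with contextlib.contextmanager elsewhere in PoolOfThreadedTreeTaggers. A minimal usage sketch under that assumption, where pool is a hypothetical, already-initialised instance:

# Assumes _useTagger is wrapped with @contextlib.contextmanager (not shown above).
with pool._useTagger('en') as tagger:
    # The tagger is borrowed from the pool and returned automatically on exit.
    tags = tagger.tag_text('Pooling avoids respawning TreeTagger processes.')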
Example #11

def get_level(file_name):
    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=os.getcwd())
    tags = tagger.tag_file(file_name + '.txt')
    tags2 = ttw.make_tags(tags)

# with open(file_name + '.tag', 'w') as f:
#     for tag in tags:
#         f.write("%s\n" % tag)

    import re

    words = []

    for tag in tags2:
        if re.search('^\w', tag.lemma):
            for word in tag.lemma.lower().split('-'):
                words.append(word)

    words = list(set(words))

    import word_level

    sentence_level = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    unknown_words = []

    for word in words:
        level = word_level.get_level(word)
        sentence_level[level] += 1
        if level == 0:
            unknown_words.append(word)

    print(sentence_level)
    print(unknown_words)

    return sentence_level
Example #12
 def __init__(self, text):
     tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',TAGDIR='/home/lr/hayashi/ra_web_app')
     self.text = text
     # if lowercased, some tokens are no longer picked up
     self.sentences = sent_tokenize(self.text)
     self.tagged = [tagger.TagText(sentence) for sentence in self.sentences]
     self.parsed = [' '.join(sentence).replace('\t', '_') for sentence in self.tagged]
Example #13

def build_corpus(filename):
    corpus = ihmm.corpus()
    # Morphologically analyse the training data and count, for each POS, the total number of words that can take that POS
    sentence_list = []
    with codecs.open(filename, "r", "utf-8") as f:
        for sentence_str in f:
            sentence_list.append(sentence_str)
    with codecs.open(filename, "r", "utf-8") as f:
        tagger = treetaggerwrapper.TreeTagger(TAGLANG="en")
        for i, sentence_str in enumerate(f):
            sentence_str = sentence_str.strip()
            if (i + 1) % 10 == 0:
                printr("データを準備しています ... {}".format(i + 1))
            result = tagger.tag_text(sentence_str)
            if len(result) == 0:
                continue
            # Build the training data while running the morphological analysis
            # English is normally space-delimited, so this may seem unnecessary, but TreeTagger splits e.g. $600 into "$ 600"
            # As a result, when evaluating with plot_en.py, a sentence's word count can differ between [space-split] and [TreeTagger], breaking the evaluation
            # Therefore, word segmentation is standardised entirely on TreeTagger
            words = []
            for metadata in result:
                metadata = metadata.split("\t")
                if len(metadata) == 3:
                    word, true_tag, lowercase = metadata
                    true_tag = collapse_true_tag(true_tag, lowercase)
                else:
                    lowercase = metadata[0]
                words.append(lowercase)
            # add the data
            corpus.add_words(words)

    return corpus
Example #14
def postTreatment(TW):
    # Function intended to convert numbers written out as words into digits.
    # Not implemented: grouping multi-word tokens, e.g. "pomme de terre" as a single token...
    return TW
def preTreatment(TW):
    for w in TW:
        if w.posTag == "NUM" and w.lemma != "@card@":
            try:
                w.word = numbers[w.word]
            except KeyError:  # number not seen before
                ans = input("Je ne comprends pas bien ce nombre : " + w.word + ". Pourriez-vous l'écrire en chiffres ?\n")
                taggedAns = formatTTG(tagger.TagText(ans))
                for wAns in taggedAns:
                    if wAns.posTag == "NUM" and wAns.lemma == "@card@":
                        numbers[w.word] = int(wAns.word)
                        w.word = numbers[w.word]
    return TW
    
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr', TAGDIR='./Ressources', TAGINENC='utf-8', TAGOUTENC='utf-8')
demande = input("Bonjour ! Que puis-je pour vous aujourd'hui ?\n")
taggedWords = formatTTG(tagger.TagText(demande))
taggedWords = postTreatment(taggedWords)

if(eatingIntention(taggedWords)):
    r = Request()
    r.fillRequest(taggedWords)
    r.printRequest()
Example #15

def tok(tex):
    """
    Tags the text and returns it in an easy-to-read column format for the user.

    Input: text
    Output: tagged text
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    tags = tagger.tag_text(tex)
    tags2 = treetaggerwrapper.make_tags(tags)
    pprint.pprint(tags2)
    empty = []
    for tag in tags2:
        tagg = tag
        empty.append(tagg)
        print(empty)

    grammar = []
    for element in empty:
        for i in element:
            #			if element.index(i)==0 or element.index(i)==1:
            grammar.append(i)
            grammar.append("\t")
        grammar.append("\n")
        res = "".join(grammar)

    return "{}".format(res)
Example #16

def pos_tag(inp: str, out, tagdir: str = '/usr/local/tree-tagger'):
    # Generates a POS representation of the data and pickles the output into a jar.
    texts, genders, ages = read_data(inp)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',
                                          TAGDIR=tagdir,
                                          TAGOPT='-token -sgml')
    logging.info('POS tagging data')

    pos_texts = []
    d_infothresholds = {
        int((i / 100.0 * len(texts))): "%i%%" % (i)
        for i in range(0, 101)
    }
    for i, t in enumerate(texts):
        tags = [
            el.split('\t')[1] for el in tagger.tag_text(t)
            if len(el.split()) == 2
        ]
        pos_texts.append(' '.join(tags))
        if i in d_infothresholds.keys():
            logging.info('{} of documents processed'.format(
                d_infothresholds[i]))

    logging.info('Pickling results to {}'.format(out.name))
    pickle.dump((pos_texts, genders, ages), out)
Example #17
def process_files(source_path,
                  target_path,
                  languages=['en'],
                  exception_file=None):
    source_path = Path(source_path)
    target_path = Path(target_path)
    for l in languages:
        tagger = treetaggerwrapper.TreeTagger(TAGLANG=l)
        ipath = Path(l) / '**/*.gz'
        paths = list(source_path.glob(str(ipath)))
        exceptions = None
        if exception_file:
            with open(exception_file) as f:
                exceptions = json.loads(f.read())[l]
                if isinstance(exceptions, dict):
                    exception_terms = []
                    for k in exceptions:
                        exception_terms += exceptions[k]
                    exceptions = exception_terms
        for f_i, path in tqdm(enumerate(paths), desc=l, total=len(paths)):
            with gzip.open(path, 'rt') as source:
                target_file = target_path / path.relative_to(source_path)
                os.makedirs(target_file.parent, exist_ok=True)

                with gzip.open(target_file, 'wt') as target:
                    for line in source:
                        if line.startswith(('\n', '\t', ' ', '<')):
                            continue
                        p = ' '.join(process(line, l, tagger, exceptions))

                        target.write(p + '\n')
Example #18

def lemma(inpath, outpath, charFilter):
    ''' Lemmatises the texts in the given folder inpath. '''
    for text in os.listdir(inpath):
        if text.endswith('.txt'):
            f_lemma = []
            result = ''
            t = open(inpath + '/' + text, 'r')
            f = t.read()
            tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
            tags = tagger.tag_text(f)
            tags2 = treetaggerwrapper.make_tags(tags)
            print("text", text)
            for t in tags2:
                try:
                    result += t.lemma
                    result += ' '
                except AttributeError:  # NotTag entries have no lemma
                    pass
            f_lemma.append(result)
            if os.path.exists(outpath + text.split('_')[1]+'_'+text.split('_')[2]+'.txt'):
                txtFile = open(outpath + text.split('_')[1]+'_'+text.split('_')[2]+'.txt', 'w')
                txtFile.write('')
                txtFile.close()
            txtFile = open(outpath + text.split('_')[1]+'_'+text.split('_')[2]+'.txt', 'a')
            for i in f_lemma:
                txtFile.write(replace(i, charFilter + ' '))
            txtFile.close()
    return
Example #19

def taggerTexte(texte):
    """
    Normalises the text and returns the lemmas.

    Arguments:
        text

    Returns:
        list of relevant lemmas
    """
    texxt = texte.split("’")
    tex = "'".join(texxt)

    if detect(tex) == "fr":
        langdet = 'fr'
    if detect(tex) == 'en':
        langdet = 'en'
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=langdet)
    tags = tagger.tag_text(tex)
    tags2 = treetaggerwrapper.make_tags(tags)
    #pprint(tags2)
    empty = []
    for tag in tags2:
        tagg = tag
        empty.append(tagg)
        #print(empty)

    grammar = []
    for element in empty:
        compt = 0
        for i in element:
            if compt == 1 or compt == 2:
                grammar.append(i)
                grammar.append("\t")
            compt += 1
        grammar.append("\n")
    del grammar[-1]
    res = "".join(grammar)

    ress = res.split("\n")

    lemmes = []
    ress = [rrr for rrr in ress if len(rrr) > 0 and "@" not in rrr]  # drop empty lines and unknown lemmas
    for rr in ress:

        match_tag = re.search(r"(.*)\t(.*)\t", rr)

        if "VER" in match_tag.group(1) or "NOM" in match_tag.group(
                1) or "ABR" in match_tag.group(1) or "ADJ" in match_tag.group(
                    1):
            lemmes.append(match_tag.group(2).lower())
        elif "VV" in match_tag.group(1) or "NN" in match_tag.group(
                1) or "NP" in match_tag.group(1) or "JJ" in match_tag.group(
                    1) or "VH" in match_tag.group(
                        1) or "VB" in match_tag.group(
                            1) or "MD" in match_tag.group(1):
            lemmes.append(match_tag.group(2).lower())
    return lemmes
Example #20
    def fit_transform(self):

        # Parsing (lemmatisation and pos-tagging)
        tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
        df = self.df.copy()
        df['tags'] = df[self.text_column].apply(
            lambda x: treetaggerwrapper.make_tags(tagger.tag_text(x)))
        df['lemma'] = df.tags.apply(lambda x: [(t.lemma).lower() if isinstance(
            t, treetaggerwrapper.Tag) else '' for t in x])
        df['text_lemma'] = df.apply(lambda row: " ".join(row.lemma), axis=1)
        df['pos'] = df.tags.apply(
            lambda x:
            [t.pos if isinstance(t, treetaggerwrapper.Tag) else '' for t in x])

        # surface based features
        df['number_verbs'] = df.pos.apply(lambda x: self.number_verbs(x))
        df['number_proper_nouns'] = df.pos.apply(
            lambda x: self.number_proper_nouns(x))
        df['number_imperative_verb'] = df.pos.apply(
            lambda x: self.number_imperative_verb(x))

        # sentiment features
        lex_dict = self.lex_df.to_dict('index')
        list_lex_exp = df.lemma.apply(lambda x: [
            lex_dict[key] if self.has_expression(exp=lex_dict[key]['lemma'],
                                                 string_ls=x,
                                                 group=lex_dict[key]['group'])
            == True else None for key in lex_dict.keys()
        ])
        list_lex_exp = list_lex_exp.apply(lambda x: list(filter(None, x)))
        has_int = self.has_intensifier(list_lex_exp)
        avg_pol = self.avg_polarity(list_lex_exp)

        df['has_intensifier'] = has_int
        df['avg_polarity'] = avg_pol
        return df
Example #21
def analyze_ft_others(outPath, name):
    """
    Calculate readability metrics for fulltexts.

    :param outPath: Raw fulltext directory
    :param name: Name of journal to be processed
    :return: Dataframe with calculated readability metrics for fulltexts in outPath/metrics/

    """

    from functions import readabilityFunctions as rf
    import treetaggerwrapper

    print("Processing fulltexts of journal " + name)

    inPath = outPath + 'pmid/' + name + '.json'
    outPathMetrics = outPath + 'metrics/' + name + '_metrics.json'

    rf.analyze(path=inPath,
               spath=outPathMetrics,
               tagger=treetaggerwrapper.TreeTagger(TAGLANG='en'),
               textType='body',
               columnList={
                   'year', 'pmid', 'doi', 'strippedText', 'wordLength',
                   'wordCount', 'sentenceCount', 'sylCount', 'flesch', 'NDC',
                   'PercDiffWord', 'DiffWord_lst'
               })
Example #22

    def __init__(self, data):
        self.processed_data = []
        self.labeling = []
        nes = ['Name', 'Surname', 'Location']

        counter = 0
        self.tagger = treetaggerwrapper.TreeTagger(TAGLANG='ru')

        # items 890:920 hang, so they are skipped

        for el in tqdm(data[:890] + data[920:]):
            elem = deepcopy(el)
            lemmas = lemmatize_words(elem['text'])

            elem['text'] = ' '.join(lemmas)
            sents = sent_tokenize(elem['text'])

            for sent in sents:
                tokens, labels = self.word2features(sent, elem)
                self.processed_data.append(tokens)
                self.labeling.append(labels)

            for ne in nes:
                if elem[ne] != []:
                    raise NormalizeError((elem[ne], elem['text'],
                                          self.pos_tagging(elem['text'])))

            counter += 1
Example #23
def get_documents(docs, stopwords):
    """Extrait les documents du corpus
    :param corpus: [(source,datetime,text)]"""

    documents = list()
    corpus = [
        (doc[0], doc[1], doc[2]) for doc in docs if len(doc[2].split()) > 3
    ]  # drop short lines of fewer than 4 words

    tagger = tagr.TreeTagger(
        TAGLANG='fr',
        TAGDIR='c:/Applications/TreeTagger',
        TAGPARFILE='C:/Applications/TreeTagger/lib/french-utf8.par')

    idx, start_time = 1, time.time()
    for doc in corpus:
        source, datetime, raw = doc[0], doc[1], doc[2]
        tags = tagr.make_tags(tagger.tag_text(clean_text(raw)))
        tags = [tag for tag in tags if type(tag) == tagr.Tag]
        # add all our elements to the array (documents)
        # each element in the array is a dictionary
        documents.append({
            'idx': idx,
            'source': source,
            'time': datetime,
            'raw': raw,
            'tags': tags
        })
        idx = progress_per(idx, len(corpus),
                           start_time)  # print the progress percentage info
    print()

    return documents
Example #24
    def __init__(self):

        # several attributes for language information
        self._plainText = ""
        self._filteredText = ""

        self._tokens = []
        self._tokensAndPOS = []
        self._lemmas = []
        self._lemmasWithoutStopwords = []
        self._lemmasAndPOS = []
        self._lemmaAndPOSDict = {}

        self._lemmasWithLanguageInfo = []

        self._lemmasAndPOSAndTokensDict = {}

        self._stopwords = []
        self._stopwords_lemmatized = []

        self._currentDramaName = ""
        self._tokensWithoutStopwords = []

        self._stopwordLists = [
            "standardList", "enhancedList", "enhancedFilteredList"
        ]

        self._tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
Example #25

def get_tagger(language: str):
    """
    :param language: language code e.g. de,nl
    :return: tagger object
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=language)
    return tagger
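
A short usage sketch for get_tagger, assuming a working TreeTagger installation for German:

tagger = get_tagger('de')
tags = treetaggerwrapper.make_tags(tagger.tag_text('Das ist ein Test.'))
for tag in tags:
    if isinstance(tag, treetaggerwrapper.Tag):
        print(tag.word, tag.pos, tag.lemma)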
Example #26
def build_tree_tagger(text, source_file, output_path):
    global dir_tree_tagger
    # build a TreeTagger wrapper
    tagger = treetaggerwrapper.TreeTagger(TAGDIR=dir_tree_tagger, TAGLANG="fr")
    # tag text
    tags = tagger.tag_text(text)
    if not output_path.exists():
        output_path.mkdir(parents=True)
    treetaggerwrapper.TreeTagger.tag_file_to(
        tagger, str(source_file), str(output_path / 'tagger_result.txt'))
    # pprint.pprint(tags)
    tags2 = treetaggerwrapper.make_tags(tags)
    # pprint.pprint(tags2)
    tag_dict = dict()
    for tag in tags2:
        if hasattr(tag, 'pos'):
            key = unicodedata.normalize('NFD', tag.word).encode('ascii', 'ignore')
            tag_dict[key] = {"pos": tag.pos, "lemma": tag.lemma}
    # pprint.pprint(tags2)
    return tag_dict, tags2
Example #27
def process_file(out_file_name):
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='pl')
    f = open(out_file_name + '.txt', 'r')
    w = open(out_file_name + '_lemmatized.txt', 'w')
    i = 0
    wrong_pos = ['SENT', 'interp']

    for line in f:
        try:
            with timeout(5, exception=RuntimeError):
                tags = tagger.tag_text(line)
                tag_list = []

                tags2 = treetaggerwrapper.make_tags(tags)

                for tag in tags2:
                    if tag.pos not in wrong_pos:
                        tag_list.append(tag.lemma)

                w.write(' '.join(tag_list) + '\n')

                i += 1
                if i % 100 == 0:
                    print(i)

        except RuntimeError:
            continue
Example #28
def run_treetagger(text, language):
    """
    Runs treetagger on the text string. 
    Returns a treetagger tagged object. 
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=language)
    tagged = tagger.tag_text(text)
    return tagged
Example #29
def lemmatize_input_files():
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    files = [f for f in glob.glob("../texts/txt/*.txt")]
    return {
        os.path.basename(f): treetaggerwrapper.make_tags(tagger.tag_file(f),
                                                         exclude_nottags=True)
        for f in files
    }
Example #30
def split_words(path, doc_id=''):
    tagger = ttw.TreeTagger(TAGLANG='en', TAGDIR=settings.TTBin)

    tags = tagger.tag_file(path)
    tags = ttw.make_tags(tags)

    return TaggedDocument(
        tags=[doc_id], words=[tag.lemma for tag in tags if tag.pos in NN_list])