def test_one(text, debug, outfile):
    """Analyze ``text`` and write the distinct analyses to ``outfile``.

    Every object produced by ``Analex.check_text`` is flattened to its
    attribute dictionary, collected into a DataFrame, reduced to a fixed set
    of columns, de-duplicated and saved as tab-separated UTF-8 text.

    Args:
        text: Text handed to ``Analex.check_text``.
        debug: Flag forwarded to ``Analex.set_debug``.
        outfile: Destination accepted by ``DataFrame.to_csv``.

    Raises:
        KeyError: If one of the selected columns is missing from the
            collected analyses (e.g. when nothing was analyzed).
    """
    columns = [
        'vocalized',
        'unvocalized',
        'word',
        'stem',
        'type',
        'root',
        'original',
        "tags",
    ]

    analyzer = qanalex.Analex(cache_path="cache/")
    analyzer.disable_allow_cache_use()
    # Honor the caller's flag; it used to be overwritten with True here,
    # which made the ``debug`` parameter meaningless.
    analyzer.set_debug(debug)

    result = analyzer.check_text(text)
    # check_text yields one list of analysis objects per entry; flatten them
    # into plain dicts so pandas can build one row per analysis.
    adapted_result = [
        analyzed.__dict__
        for analyzed_list in result
        for analyzed in analyzed_list
    ]
    df = pd.DataFrame(adapted_result)
    # Function-call form works on both Python 2 and Python 3.
    print(df.columns.values)

    display = df[columns].drop_duplicates()
    display.to_csv(outfile, sep='\t', encoding="utf8")
def test_quran(text, debug, outfile):
    """Analyze ``text`` and export all, unknown and known analyses.

    The analyzer is configured by calling ``disable_allow_cache_use()`` and
    ``enable_fully_vocalized_input()`` before ``check_text`` runs. Every
    returned analysis object is flattened to its attribute dictionary,
    reduced to a fixed set of columns and de-duplicated. Three tab-separated
    UTF-8 files are written:

    * ``outfile`` -- every distinct row,
    * ``outfile + ".unknown.csv"`` -- rows whose ``type`` is ``"unknown"``,
    * ``outfile + ".known.csv"`` -- all remaining rows.

    Per-column non-null counts of both subsets are printed at the end.

    Args:
        text: Text handed to ``Analex.check_text``.
        debug: Flag forwarded to ``Analex.set_debug``.
        outfile: Path of the main output file; it is also used as the prefix
            of the two derived file names, so it must be a string.

    Raises:
        KeyError: If one of the selected columns is missing from the
            collected analyses (e.g. when nothing was analyzed).
    """
    columns = [
        'vocalized',
        'unvocalized',
        'word',
        'stem',
        'type',
        'root',
        'original',
        "tags",
    ]

    analyzer = qanalex.Analex(cache_path="cache/")
    analyzer.disable_allow_cache_use()
    analyzer.enable_fully_vocalized_input()
    analyzer.set_debug(debug)

    result = analyzer.check_text(text)
    # One dict per analysis object, across all per-entry lists.
    adapted_result = [
        analyzed.__dict__
        for analyzed_list in result
        for analyzed in analyzed_list
    ]
    df = pd.DataFrame(adapted_result)
    # Function-call form works on both Python 2 and Python 3.
    print(df.columns.values)

    display = df[columns].drop_duplicates()
    display.to_csv(outfile, sep='\t', encoding="utf8")

    # Index the column explicitly: attribute access (``display.type``) only
    # works while no DataFrame attribute of that name exists.
    is_unknown = display["type"] == "unknown"

    display_unknown = display[is_unknown]
    display_unknown.to_csv(outfile + ".unknown.csv", sep='\t', encoding='utf8')

    display_known = display[~is_unknown]
    display_known.to_csv(outfile + ".known.csv", sep='\t', encoding='utf8')

    print("Unknown ", display_unknown.count())
    print("known ", display_known.count())
#!/usr/bin/env python
import componentes
import analex
import anasintac
import flujo
import string
import sys
import AST
from sys import argv
# NOTE(review): the ``sets`` module only exists on Python 2.
from sets import ImmutableSet

############################################################################
#
# Function: __main__
# Task: Main program
# Parameters: --
# Returns: --
#
############################################################################
if __name__ == "__main__":
    # Expects exactly one command-line argument: the path of the file to
    # analyze. Any other argument count fails the unpacking with ValueError.
    script, filename = argv
    # Keep the source file open only for the duration of the analysis so the
    # handle is released even if the analysis raises.
    with open(filename) as txt:
        fl = flujo.Flujo(txt)
        # A distinct local name avoids rebinding the imported ``analex``
        # module to an instance.
        lexer = analex.Analex(fl)
        anasintac.Anasintac().Analiza(lexer)
import io

import pandas as pd

# Sample sentence analyzed when the input file cannot be used.
default_text = u"السلام عليكم يستعملونهم"

# Read the input exactly once, inside the guarded section. The previous
# unguarded duplicate read made the fallback below unreachable and leaked
# two file handles.
try:
    # io.open decodes while reading and returns unicode text on both
    # Python 2 and Python 3 (str.decode does not exist on Python 3).
    with io.open(filename, encoding='utf8') as myfile:
        text = myfile.read()
    # read() never returns None; an empty file is the case worth catching.
    if not text:
        text = default_text
except (IOError, OSError, UnicodeDecodeError):
    # Unreadable or undecodable input: fall back to the sample sentence.
    text = default_text
    print(" given text")

debug = False
limit = 500

analyzer = analex.Analex(cache_path="cache/")
analyzer.set_debug(debug)
result = analyzer.check_text(text)

# check_text yields one list of analysis objects per entry; flatten them into
# plain dicts so pandas can build one row per analysis.
adapted_result = [
    analyzed.__dict__
    for analyzed_list in result
    for analyzed in analyzed_list
]
df = pd.DataFrame(adapted_result)
print(df.columns.values)
print(df.head(12))
display = df[['word', 'stem', 'type', 'root']]
# <Signo> -> + pass elif self.componente.cat == "OpAdd": # <Signo> -> - pass else: raise errores.ErrorSintactico( "Error sintáctico analizaTermino()") except errores.Error as err: sys.stderr.write("%s\n" % err) def sincroniza(self, sinc): sinc |= "eof" # Nos aseguramos de que este eof while self.componente.cat not in sinc: self.avanza() if __name__ == "__main__": script, filename = argv txt = open(filename) print("Este es tu fichero %r" % filename) i = 0 TS = TS.TS() fl = flujo.Flujo(txt) analexi = analex.Analex(fl) anasint = Anasin(analexi, TS) try: arbolPrograma = anasint.analizaPrograma() anasint.comprueba("eof") except errores.Error as err: sys.stderr.write("%s\n" % err)