Ejemplo n.º 1
0
def fic2text(ident):
   """Return the tag names and the ASCII-only text of the fic *ident*.

   The text segments are read from ``data['fics']`` (field ``'fic'``) and the
   tags from ``data['base']`` (field ``'tags'``) through ``Loader.get_field``.

   Each segment has its non-breaking spaces (U+00A0) turned into plain
   spaces, every remaining non-ASCII character dropped, and a single space
   appended; the segments are then concatenated in order.

   The Flesch reading ease and Flesch-Kincaid grade of the resulting text
   are computed with ``textstat`` and printed together with *ident*; they
   are not returned.

   Returns:
      A ``(tags, rtext)`` tuple: ``tags`` is the list of ``"name"`` values
      of every element under every key of the tags field, ``rtext`` is the
      cleaned text.
   """
   textsegs = Loader.get_field(data['fics'],ident,'fic')
   rtags = Loader.get_field(data['base'],ident,'tags')

   cleaned = []
   for line in textsegs:
      # Map non-breaking spaces to regular ones first, otherwise the ASCII
      # round-trip below would delete them and glue adjacent words together.
      line = line.replace(u'\xa0',' ')
      cleaned.append(line.encode('ascii', 'ignore').decode('ascii'))
   # Every segment is followed by one space, including the last one.
   # Punctuation and runs of whitespace are left exactly as they appear.
   rtext = "".join(seg + " " for seg in cleaned)

   # rtags is indexed by each key it yields; every entry is a sequence of
   # elements carrying a "name" key.
   tags = [el["name"] for genre in rtags for el in rtags[genre]]

   reading_ease = textstat.flesch_reading_ease(rtext)
   reading_level = textstat.flesch_kincaid_grade(rtext)
   print(ident,reading_ease,reading_level)
   return tags,rtext
Ejemplo n.º 2
0
def fic2text(ident,master):
   """Return the processed text of the fic *ident* as a single string.

   The text segments are read from ``data['fics']`` (field ``'fic'``)
   through ``Loader.get_field``. Each segment is passed through
   ``clean_line`` and then ``proc_line``; the results are concatenated in
   order with no separator added.

   Args:
      ident: identifier handed to ``Loader.get_field`` to select the fic.
      master: not used by this function; kept so existing callers that
         pass it keep working.

   Returns:
      The concatenation of all processed segments (``""`` when there are
      no segments).
   """
   textsegs = Loader.get_field(data['fics'],ident,'fic')
   # NOTE(review): the tags are looked up but never used here. The call is
   # kept in case callers rely on it failing for an unknown ident -- confirm
   # and drop it if not.
   Loader.get_field(data['base'],ident,'tags')

   return "".join(proc_line(clean_line(line)) for line in textsegs)