def process_file(args):
    # Variant that reads an ISO-8859-1 file and writes the tokenized sentences
    # to an already-open shared output file.
    in_file, i, nfiles, out_file = args
    logger.info('Will process file %s (%d/%d)', in_file, i + 1, nfiles)

    t = coh.Text(filepath=in_file, encoding='iso-8859-1')

    # Drop leading paragraphs until the first one ends in sentence punctuation.
    deleted = 0
    try:
        while t.paragraphs[0][-1] not in ('.', ':', '?', '!'):
            del t.paragraphs[0]
            deleted += 1
    except IndexError:
        # Every paragraph was removed; skip the file entirely.
        logger.fatal('Ignored file %s', in_file)
        return
    logger.info('Deleted %d lines', deleted)

    # Split paragraphs into sentences and lowercase every token.
    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])
    tokens = [[word.lower() for word in word_tokenize(sent)]
              for sent in sentences]

    joined_sentences = '\n'.join([' '.join(sentence) for sentence in tokens])
    out_file.write(joined_sentences)
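# --- Hypothetical module-level setup (not part of the original scripts) ---
# A minimal sketch of the names the process_file variants rely on (logger,
# senter, stopwords, word_tokenize, chain, codecs, os, coh). The choice of a
# Portuguese Punkt model and Portuguese stopword list is an assumption based on
# the corpus being processed; adjust to the actual language and paths.
import codecs
import logging
import os
from itertools import chain

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords

import coh  # text-metrics package providing coh.Text and the metric classes

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pre-trained Punkt sentence splitter; the language choice is an assumption.
senter = nltk.data.load('tokenizers/punkt/portuguese.pickle')

# Stopword set used by the filtering variants; language choice is an assumption.
stopwords = set(nltk_stopwords.words('portuguese'))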
def process_file(args):
    # Variant that reads a UTF-16LE file, strips template-like paragraphs that
    # start with '{', filters stopwords and non-alphabetic tokens, and mirrors
    # the input directory layout under out_dir.
    in_file, i, nfiles, out_dir = args
    logger.info('Will process file %s (%d/%d)', in_file, i + 1, nfiles)

    t = coh.Text(filepath=in_file, encoding='utf-16le')

    deleted = 0
    try:
        while t.paragraphs[1].startswith('{'):
            del t.paragraphs[1]
            deleted += 1
    except IndexError:
        logger.fatal('Ignored file %s', in_file)
        return
    logger.info('Deleted %d lines', deleted)

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])
    tokens = [[word.lower() for word in word_tokenize(sent)
               if word.lower() not in stopwords and word.isalpha()]
              for sent in sentences]

    filename = os.path.basename(in_file)
    dirname = os.path.basename(os.path.dirname(in_file))
    with codecs.open(os.path.join(out_dir, dirname, filename),
                     encoding='utf-8', mode='w') as out_file:
        joined_sentences = '\n'.join([' '.join(sentence) for sentence in tokens])
        out_file.write(joined_sentences)
def process_file(args):
    # Variant that reads from in_dir, removes leading paragraphs that do not end
    # in sentence punctuation, filters stopwords and non-alphabetic tokens, and
    # writes the result to a file of the same name under out_dir.
    in_file, i, nfiles, in_dir, out_dir = args
    logger.info('Will process file %s (%d/%d)', in_file, i + 1, nfiles)

    t = coh.Text(filepath=os.path.join(in_dir, in_file))

    deleted = 0
    while t.paragraphs[0][-1] not in ('.', ':', '?', '!'):
        del t.paragraphs[0]
        deleted += 1
    logger.info('Deleted %d lines', deleted)

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])
    tokens = [[word.lower() for word in word_tokenize(sent)
               if word.lower() not in stopwords and word.isalpha()]
              for sent in sentences]

    with open(os.path.join(out_dir, in_file), 'w') as out_file:
        joined_sentences = '\n'.join([' '.join(sentence) for sentence in tokens])
        out_file.write(joined_sentences)
def process_file(args):
    # Variant for UTF-16LE files: skips ahead to the first paragraph that starts
    # with '{', removes that template block, and appends the tokenized sentences
    # to an already-open shared output file. Returns 1 if the file was ignored,
    # 0 otherwise.
    in_file, i, nfiles, out_file = args
    logger.info('Will process file %s (%d/%d)', in_file, i + 1, nfiles)

    t = coh.Text(filepath=in_file, encoding='utf-16le')

    first = 1
    while not t.paragraphs[first].startswith('{'):
        first += 1

    deleted = 0
    try:
        while t.paragraphs[first].startswith('{'):
            del t.paragraphs[first]
            deleted += 1
    except IndexError:
        logger.fatal('Ignored file %s', in_file)
        return 1
    logger.info('Deleted %d lines', deleted)

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])
    tokens = [[word.lower() for word in word_tokenize(sent)]
              for sent in sentences]

    joined_sentences = '\n'.join([' '.join(sentence) for sentence in tokens])
    out_file.write(joined_sentences + '\n')

    return 0
def process_file(args):
    wiki_file, i, nfiles, wiki_dir, out_file = args
    logger.info('Will process file %s (%d/%d)', wiki_file, i + 1, nfiles)

    t = coh.Text(filepath=os.path.join(wiki_dir, wiki_file))

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])
    tokens = [[word.lower() for word in word_tokenize(sent)]
              for sent in sentences]

    joined_sentences = '\n'.join([' '.join(sentence) for sentence in tokens])
    out_file.write(joined_sentences + '\n')
def process_file(args):
    wiki_file, i, nfiles, wiki_dir, out_dir = args
    logger.info('Will process file %s (%d/%d)', wiki_file, i + 1, nfiles)

    t = coh.Text(filepath=os.path.join(wiki_dir, wiki_file))

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])
    tokens = [[word.lower() for word in word_tokenize(sent)
               if word.lower() not in stopwords and word.isalpha()]
              for sent in sentences]

    with open(os.path.join(out_dir, wiki_file), 'w') as out_file:
        joined_sentences = '\n'.join(
            [' '.join(sentence) for sentence in tokens])
        out_file.write(joined_sentences)
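# --- Hypothetical driver (not part of the original scripts) ---
# A minimal sketch of how a process_file variant taking
# (file, index, nfiles, wiki_dir, out_dir) tuples might be invoked in parallel.
# The use of multiprocessing.Pool and the 'wiki'/'tokens' directory names are
# assumptions for illustration only.
if __name__ == '__main__':
    from multiprocessing import Pool

    wiki_dir = 'wiki'    # assumed input directory
    out_dir = 'tokens'   # assumed output directory

    wiki_files = sorted(os.listdir(wiki_dir))
    nfiles = len(wiki_files)
    file_args = [(f, i, nfiles, wiki_dir, out_dir)
                 for i, f in enumerate(wiki_files)]

    with Pool() as pool:
        pool.map(process_file, file_args)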
    coh.Hypernyms(),
    coh.Tokens(),
    coh.Connectives(),
    coh.Ambiguity(),
    coh.SyntacticalComplexity(),
    coh.SemanticDensity(),
    coh.Constituents(),
    coh.Anaphoras(),
    coh.Coreference(),
    coh.Lsa(),
    # coh.Disfluencies(),
])

print("Loading metrics...")
t = coh.Text(docs_new[0])
r = all_metrics.values_for_text(t)
print(r.as_table())

print(" Possible school-level classification")
# predicted = classifier_liblinear.predict(r.as_array())
# predicted = classifier_linear.predict(r.as_array())
predicted = classifier_rbf.predict(r.as_array())
# print("predicted:", predicted)

for doc, category in zip(docs_new, predicted):
    # print(" Label number:", category, categories[category])
    print(categories[category])
print("+--------------------------------------------+---------------------+")
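# --- Hypothetical training step (not part of the original script) ---
# A minimal sketch, assuming classifier_rbf above is a scikit-learn SVM with an
# RBF kernel fit on one metric vector per labelled document. The placeholder
# X_train/y_train arrays stand in for the real training corpus and labels.
import numpy as np
from sklearn.svm import SVC

X_train = np.array([[0.1, 0.2, 0.3],   # metric vector for a training document
                    [0.4, 0.5, 0.6],
                    [0.7, 0.8, 0.9]])
y_train = np.array([0, 1, 2])          # school-level category indices

classifier_rbf = SVC(kernel='rbf', gamma='scale')
classifier_rbf.fit(X_train, y_train)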