Code example #1
def process_file(args):
    in_file, i, nfiles, out_file = args

    logger.info('Will process file %s (%d/%d)', in_file, i + 1, nfiles)

    t = coh.Text(filepath=in_file, encoding='iso-8859-1')

    deleted = 0
    try:
        # Drop leading paragraphs until one ends with sentence-final
        # punctuation.
        while t.paragraphs[0][-1] not in ('.', ':', '?', '!'):
            del t.paragraphs[0]
            deleted += 1
    except IndexError:
        # Every paragraph was consumed: nothing usable in this file.
        logger.fatal('Ignored file %s', in_file)
        return

    logger.info('Deleted %d paragraphs', deleted)

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])

    tokens = [[word.lower() for word in word_tokenize(sent)]
              for sent in sentences]

    joined_sentences = '\n'.join([' '.join(sentence) for sentence in tokens])
    out_file.write(joined_sentences)
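
All of these variants assume the same module-level context, which this listing omits. Below is a minimal sketch of that context, reconstructed from how the names are used; the `coh` import, the Punkt model path, and the language choice are assumptions, not confirmed by the original source.

# Assumed module-level setup (not shown in the original listing; names
# reconstructed from usage, so the real project may differ).
import codecs
import logging
import os
from itertools import chain

import nltk
from nltk import word_tokenize

import coh  # text/metrics library these snippets are built on

logger = logging.getLogger(__name__)

# Sentence splitter; an NLTK Punkt model is one plausible choice
# (Portuguese is a guess, based on the strings in example #7).
senter = nltk.data.load('tokenizers/punkt/portuguese.pickle')

# Stop-word set consulted by the filtering variants (#2, #3 and #6).
stopwords = set(nltk.corpus.stopwords.words('portuguese'))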
Code example #2
def process_file(args):
    in_file, i, nfiles, out_dir = args

    logger.info('Will process file %s (%d/%d)', in_file, i + 1, nfiles)

    t = coh.Text(filepath=in_file, encoding='utf-16le')

    deleted = 0
    try:
        # Delete the run of '{'-prefixed paragraphs that follows the
        # first paragraph.
        while t.paragraphs[1].startswith('{'):
            del t.paragraphs[1]
            deleted += 1
    except IndexError:
        logger.fatal('Ignored file %s', in_file)
        return

    logger.info('Deleted %d paragraphs', deleted)

    sentences = chain.from_iterable(
        [senter.tokenize(p) for p in t.paragraphs])

    tokens = [[word.lower() for word in word_tokenize(sent)
               if word.lower() not in stopwords and word.isalpha()]
              for sent in sentences]

    # Mirror the input file's parent directory under out_dir; the target
    # directory must already exist, since codecs.open will not create it.
    filename = os.path.basename(in_file)
    dirname = os.path.basename(os.path.dirname(in_file))

    with codecs.open(os.path.join(out_dir, dirname, filename),
                     encoding='utf-8', mode='w') as out_file:
        joined_sentences = '\n'.join([' '.join(sentence)
                                      for sentence in tokens])
        out_file.write(joined_sentences)
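
The single packed `args` tuple, together with the `(i, nfiles)` progress pair, is the usual calling convention for `multiprocessing.Pool.map`, which passes one argument per task. A hypothetical driver for the variant above (`files` and `out_dir` are made-up names, not from the original):

# Hypothetical driver (not in the original listing).
from multiprocessing import Pool

if __name__ == '__main__':
    files = ['corpus/a.txt', 'corpus/b.txt']  # hypothetical input list
    out_dir = 'out'                           # hypothetical output root

    with Pool() as pool:
        pool.map(process_file,
                 [(f, i, len(files), out_dir) for i, f in enumerate(files)])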
Code example #3
def process_file(args):
    in_file, i, nfiles, in_dir, out_dir = args

    logger.info('Will process file %s (%d/%d)', in_file, i + 1, nfiles)

    t = coh.Text(filepath=os.path.join(in_dir, in_file))

    deleted = 0
    # Drop leading paragraphs until one ends with sentence-final
    # punctuation; unlike example #1, this variant does not guard
    # against exhausting the file with a try/except.
    while t.paragraphs[0][-1] not in ('.', ':', '?', '!'):
        del t.paragraphs[0]
        deleted += 1

    logger.info('Deleted %d paragraphs', deleted)

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])

    tokens = [[
        word.lower() for word in word_tokenize(sent)
        if word.lower() not in stopwords and word.isalpha()
    ] for sent in sentences]

    with open(os.path.join(out_dir, in_file), 'w') as out_file:
        joined_sentences = '\n'.join(
            [' '.join(sentence) for sentence in tokens])
        out_file.write(joined_sentences)
Code example #4
def process_file(args):
    in_file, i, nfiles, out_file = args

    logger.info('Will process file %s (%d/%d)', in_file, i + 1, nfiles)

    t = coh.Text(filepath=in_file, encoding='utf-16le')

    deleted = 0
    try:
        # Find the first '{'-prefixed paragraph; the search runs inside
        # the try so that a file with no such paragraph is skipped
        # instead of crashing with an uncaught IndexError.
        first = 1
        while not t.paragraphs[first].startswith('{'):
            first += 1
        # Delete the whole run of '{'-prefixed paragraphs from there.
        while t.paragraphs[first].startswith('{'):
            del t.paragraphs[first]
            deleted += 1
    except IndexError:
        logger.fatal('Ignored file %s', in_file)
        return 1

    logger.info('Deleted %d paragraphs', deleted)

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])

    tokens = [[word.lower() for word in word_tokenize(sent)]
              for sent in sentences]

    joined_sentences = '\n'.join([' '.join(sentence) for sentence in tokens])

    out_file.write(joined_sentences + '\n')

    return 0
Code example #5
def process_file(args):
    wiki_file, i, nfiles, wiki_dir, out_file = args

    logger.info('Will process file %s (%d/%d)', wiki_file, i + 1, nfiles)

    t = coh.Text(filepath=os.path.join(wiki_dir, wiki_file))

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])

    tokens = [[word.lower() for word in word_tokenize(sent)]
              for sent in sentences]

    joined_sentences = '\n'.join([' '.join(sentence) for sentence in tokens])

    out_file.write(joined_sentences + '\n')
Code example #6
def process_file(args):
    wiki_file, i, nfiles, wiki_dir, out_dir = args

    logger.info('Will process file %s (%d/%d)', wiki_file, i + 1, nfiles)

    t = coh.Text(filepath=os.path.join(wiki_dir, wiki_file))

    sentences = chain.from_iterable([senter.tokenize(p) for p in t.paragraphs])

    tokens = [[
        word.lower() for word in word_tokenize(sent)
        if word.lower() not in stopwords and word.isalpha()
    ] for sent in sentences]

    with open(os.path.join(out_dir, wiki_file), 'w') as out_file:
        joined_sentences = '\n'.join(
            [' '.join(sentence) for sentence in tokens])
        out_file.write(joined_sentences)
Code example #7
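(This snippet begins mid-expression: the metric instances below are evidently the tail of a list bound to `all_metrics`, which is queried further down; the opening of that list is not shown in the source.)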
    coh.Hypernyms(),
    coh.Tokens(),
    coh.Connectives(),
    coh.Ambiguity(),
    coh.SyntacticalComplexity(),
    coh.SemanticDensity(),
    coh.Constituents(),
    coh.Anaphoras(),
    coh.Coreference(),
    coh.Lsa(),
    # coh.Disfluencies(),
])

print("Carregando métricas...")

t = coh.Text(docs_new[0])

r = all_metrics.values_for_text(t)

print(r.as_table())

print("                     Possível classificação do nível escolar")
#predicted = classifier_liblinear.predict(r.as_array())
#predicted = classifier_linear.predict(r.as_array())
predicted = classifier_rbf.predict(r.as_array())
#print("predicted:",predicted)
for doc, category in zip(docs_new, predicted):
    #print("                     Num_Rotulo:",category, categories[category])
    print(categories[category])

print("+--------------------------------------------+---------------------+")