Example #1
import os
from multiprocessing import Pool, cpu_count


def parsum_all(rootdir=None,
               pdfs="pdfs/",
               overview="out/overview.txt",
               texts="out/pdftexts/",
               sums="out/sums/",
               keys="out/keys/",
               wk=10,
               sk=8):
    """Parallel summarizer: runs the per-PDF pipeline in a process pool.

    walk, ensure_path, and sum_one are project-local helpers defined elsewhere.
    """
    if rootdir:
        rootdir = os.path.abspath(rootdir) + "/"
        names = (pdfs, overview, texts, sums, keys)
        pdfs, overview, texts, sums, keys = tuple(rootdir + x for x in names)

    # use roughly a third of the available cores, but at least two workers
    count = max(2, cpu_count() // 3)
    with Pool(processes=count) as pool:
        trim = len(pdfs)  # prefix length to strip when deriving output names
        fs = [pdf for pdf in walk(dir=pdfs) if pdf[-4:].lower() == ".pdf"]
        chunksize = 1  # max(1, len(fs) // (4 * count)) would batch the work instead
        print('pdf files:', len(fs), 'processes:', count, 'chunksize:', chunksize)
        args = [(pdf, trim, texts, sums, keys, wk, sk) for pdf in fs]
        ensure_path(overview)
        with open(overview, 'w') as outf:
            # imap hands each tuple to sum_one as a single argument
            for text in pool.imap(sum_one, args, chunksize=chunksize):
                if text:
                    print(text, file=outf)
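
sum_one itself is not shown on this page; since imap delivers one tuple per call, a minimal adapter would unpack it and delegate to summarize_one from Example #5 below (this delegation is an assumption):

def sum_one(args):
    # unpack the single tuple that pool.imap passes in
    pdf, trim, texts, sums, keys, wk, sk = args
    return summarize_one(pdf, trim, texts, sums, keys, wk, sk)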
Example #2
import matplotlib.pyplot as plt


def plot_graphs(fname, history, metric):
    """Plot a training metric from a Keras History object and save it as a PDF."""
    nlp.ensure_path(fname)
    plt.plot(history.history[metric])
    # plt.plot(history.history['val_' + metric])  # enable when validation data is tracked
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    # legend kept in sync with the curves actually plotted
    plt.legend([metric])  # add 'val_' + metric here when the validation curve is enabled
    plt.savefig(fname + '.pdf', format="pdf", bbox_inches='tight')
    # plt.show()
    plt.close()
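
plot_graphs only reads history.history, so any object exposing that dict will do; a minimal smoke test (the SimpleNamespace stand-in is hypothetical):

from types import SimpleNamespace

# stand-in for a Keras History object; only the .history dict is read
fake = SimpleNamespace(history={'loss': [0.9, 0.5, 0.3]})
plot_graphs("pics/demo_loss", fake, 'loss')  # writes pics/demo_loss.pdf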
Example #3
    # assumes: from tensorflow import keras; from tensorflow.keras import layers
    def __init__(self, fname='texts/english', activation='sigmoid'):
        nlp.ensure_path("out/")
        nlp.ensure_path("pics/")
        model_file = "out/" + fname + "_model"
        # skip training if a saved model already exists; it will be loaded elsewhere
        if nlp.exists_file(model_file): return
        super().__init__(fname=fname)
        model = keras.Sequential()
        # only the first layer needs the input dimension; Keras ignores input_dim
        # on later layers, so it is dropped there
        model.add(
            layers.Dense(128,
                         input_dim=self.hot_X.shape[1],
                         activation=activation))
        # model.add(layers.Dropout(0.5))
        model.add(layers.Dense(128, activation=activation))
        # model.add(layers.Dropout(0.5))
        model.add(layers.Dense(128, activation=activation))
        # model.add(layers.Dropout(0.5))
        model.add(layers.Dense(128, activation=activation))
        # model.add(layers.Dropout(0.5))
        # one output unit per one-hot class
        model.add(layers.Dense(self.hot_y.shape[1], activation='sigmoid'))
        model.summary()
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        history = model.fit(self.hot_X, self.hot_y, epochs=100, batch_size=16)

        model.save(model_file)  # to be loaded by Inferencer and used for QA

        # visualize and inform about accuracy and loss
        plot_graphs("pics/" + fname + "_loss", history, 'loss')
        plot_graphs("pics/" + fname + "_acc", history, 'accuracy')

        loss, accuracy = model.evaluate(self.hot_X, self.hot_y)
        print('Accuracy:', round(100 * accuracy, 2), '%, Loss:',
              round(100 * loss, 2), '%')
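
The model.save call above is what the Inferencer comment refers to; reloading it elsewhere is plain Keras (the path shown assumes the default fname):

from tensorflow import keras

# reload the trained model saved by __init__ (default fname='texts/english')
model = keras.models.load_model("out/texts/english_model")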
Example #4
def summarize_all(rootdir=None,
                  pdfs="pdfs/",
                  overview="out/overview.txt",
                  texts="out/pdftexts/",
                  sums="out/sums/",
                  keys="out/keys/",
                  wk=10,
                  sk=8):
    """ sequential summarizer"""
    if rootdir:
        rootdir = os.path.abspath(rootdir) + "/"
        names = (pdfs, overview, texts, sums, keys)
        pdfs, overview, texts, sums, keys = tuple(rootdir + x for x in names)
    ensure_path(overview)
    with open(overview, 'w') as outf:
        trim = len(pdfs)  # prefix length to strip when deriving output names
        for pdf in walk(dir=pdfs):
            text = summarize_one(pdf, trim, texts, sums, keys, wk, sk)
            if not text: continue
            print(text, file=outf)
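
A plausible invocation, assuming the PDFs live under a papers/ directory (the directory name is hypothetical):

# re-roots all output paths under the absolute path of 'papers'
summarize_all(rootdir="papers")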
Example #5
def summarize_one(pdf, trim, texts, sums, keys, wk, sk):
    '''Summarize one PDF: extract text, then write keyword and summary files.'''
    if pdf[-4:].lower() != ".pdf": return None  # skip non-PDF files

    name = pdf[trim:-4]  # path relative to the pdfs/ root, ".pdf" stripped

    tname0 = texts + name          # base path, no extension
    tname = texts + name + ".txt"  # extracted plain text
    sname = sums + name + ".txt"   # summary sentences
    kname = keys + name + ".txt"   # keywords

    ensure_path(tname)
    try:
        print('START processing:', pdf)
        if not exists_file(tname):
            pdf2txt(pdf, tname)
            clean_text_file(tname)

        nlp = NLP()
        nlp.from_file(tname0)
        kws, sents, _ = nlp.info(wk, sk)

        ktext = "\n".join(kws)
        ensure_path(kname)
        text2file(ktext, kname)

        stext = "\n".join(sents)
        ensure_path(sname)
        text2file(stext, sname)
        print('WRITTEN TO', sname, kname)

        text = "\n".join(
            ['FILE:', pdf, '\nSUMMARY:', stext, '\nKEYWORDS:', ktext, '\n'])
        print('DONE processing:', pdf)
        return text
    except Exception as e:  # a bare except would also swallow KeyboardInterrupt
        print('ERROR:', type(e).__name__, e)
        print('processing failed on:', pdf)
        return None
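
For a single file, the call mirrors how summarize_all drives it (the PDF name is hypothetical):

text = summarize_one("pdfs/paper.pdf", len("pdfs/"),
                     "out/pdftexts/", "out/sums/", "out/keys/", wk=10, sk=8)
if text:
    print(text)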