def parsePhysicDataSet(src_path, key_src_path, dest_path):
    # key_src_path is unused here: the key terms come from the book itself.
    openStaxCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: PhysicWebBlocks.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: PhysicWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: PhysicWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(lambda a: PhysicWebBlocks.findTextBlock(a),
                             Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: PhysicWebBlocks.findKeyTermsBlock(a), Filter.MANDATORY_EXP)

    # Register the filters with the crawler.
    openStaxCrawler.appendStartFilter(initFilter)
    openStaxCrawler.appendStartChapterFilter(startChapterFilter)
    openStaxCrawler.appendEndChapterFilter(endChapterFilter)
    openStaxCrawler.appendIncludeTextFilters(textFilter)
    openStaxCrawler.appendKeyTermsFilter(keyTermsFilter)

    # Parse the source PDF into {file_name: [chapter_dict, ...]}.
    dict_files = openStaxCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            # Chapter text file (dest_path is assumed to end in a separator).
            with open(dest_path + key[:6] + '_' + str(i) + '.txt', 'w+',
                      encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))
            # Gold key terms, one per line; keep only the text before ':'.
            with open(dest_path + key[:6] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                if 'chapter_keys' in value:
                    for term in value['chapter_keys']:
                        f.write(term.split(':')[0] + '\t\n')
    return dict_files

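# The parse functions in this module all wire up the same filter set and
# differ only in the *WebBlocks class they delegate to. As a sketch only
# (the helper name is hypothetical; it assumes the Filter/PDFCrawler API
# used above and covers the common single-argument block finders), the
# wiring could be factored out like this:
def buildStandardCrawler(blocks):
    crawler = PDFCrawler()
    # Pair each block finder with the crawler hook it belongs to.
    for find_fn, append_fn in [
        (blocks.findStartBlock, crawler.appendStartFilter),
        (blocks.findStartChapterBlock, crawler.appendStartChapterFilter),
        (blocks.findEndChapterBlock, crawler.appendEndChapterFilter),
        (blocks.findTextBlock, crawler.appendIncludeTextFilters),
        (blocks.findKeyTermsBlock, crawler.appendKeyTermsFilter),
    ]:
        f = Filter()
        f.addExpression(find_fn, Filter.MANDATORY_EXP)
        append_fn(f)
    return crawler
# e.g. the function above could then begin with:
#     openStaxCrawler = buildStandardCrawler(PhysicWebBlocks)
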
def parseHumanGeography(src_path, key_src_path, dest_path):
    hToToCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: GeographyWebBlocks.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: GeographyWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: GeographyWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(lambda a: GeographyWebBlocks.findTextBlock(a),
                             Filter.MANDATORY_EXP)
    kwFilter = Filter()
    kwFilter.addExpression(lambda a: GeographyWebBlocks.findKeyTermsBlock(a),
                           Filter.MANDATORY_EXP)

    # Register the filters with the crawler.
    hToToCrawler.appendStartFilter(initFilter)
    hToToCrawler.appendStartChapterFilter(startChapterFilter)
    hToToCrawler.appendEndChapterFilter(endChapterFilter)
    hToToCrawler.appendIncludeTextFilters(textFilter)
    hToToCrawler.appendKeyTermsFilter(kwFilter)

    dict_files = hToToCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:-4] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            # Collect the key terms, keeping only the text before ':'.
            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            # Write the non-blank terms, one per line.
            with open(dest_path + key[:-4] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    if len(term.strip()) > 0:
                        f.write(term + '\t\n')
    return dict_files

def parseBiologyDataSet2(src_path, key_src_path, dest_path):
    biologyCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: BiologyWebBlocks2.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: BiologyWebBlocks2.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: BiologyWebBlocks2.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(lambda a: BiologyWebBlocks2.findTextBlock(a),
                             Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: BiologyWebBlocks2.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    # Register the filters with the crawler.
    biologyCrawler.appendStartFilter(initFilter)
    biologyCrawler.appendStartChapterFilter(startChapterFilter)
    biologyCrawler.appendEndChapterFilter(endChapterFilter)
    biologyCrawler.appendIncludeTextFilters(textFilter)
    biologyCrawler.appendKeyTermsFilter(keyTermsFilter)

    dict_files = biologyCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:6] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            # Collect the key terms, keeping only the text before ':'.
            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            # chapter_keys.extend(checkExistsInFile2(txt_f_name, extended_key_terms))
            with open(dest_path + key[:6] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files

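# The per-chapter write-out loop above is repeated in every parser. A possible
# shared writer (a sketch only; the function name is hypothetical, stem_len
# mirrors the key[:6] / key[:-4] slices used above, and dest_path is assumed
# to end in a path separator):
def writeChapterFiles(dict_files, dest_path, stem_len=6):
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            stem = dest_path + key[:stem_len] + '_' + str(i)
            with open(stem + '.txt', 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))
            with open(stem + '.key', 'w+', encoding="utf8") as f:
                for term in value.get('chapter_keys', []):
                    term = term.split(':')[0]
                    if term.strip():
                        f.write(term + '\t\n')
# Call with stem_len=-4 for the parsers that slice the '.pdf' extension off
# the source file name instead of truncating it to six characters.
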
def parseIGCSEPhysics(src_path, key_src_path, dest_path):
    physicCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findStartBlock(a),
        Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findTextBlock(a),
        Filter.MANDATORY_EXP)
    # The glossary finder takes two arguments, unlike the other block finders.
    glossaryTermsFilter = Filter()
    glossaryTermsFilter.addExpression(
        lambda a, b: PhysicsWebBlocksIGCSE.findGlossaryBlock(a, b),
        Filter.MANDATORY_EXP)
    pageFilter = Filter()
    pageFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findPageNumber(a),
        Filter.MANDATORY_EXP)

    # Register the filters with the crawler.
    physicCrawler.appendStartFilter(initFilter)
    physicCrawler.appendStartChapterFilter(startChapterFilter)
    physicCrawler.appendEndChapterFilter(endChapterFilter)
    physicCrawler.appendIncludeTextFilters(textFilter)
    physicCrawler.appendGlossaryFilter(glossaryTermsFilter)
    physicCrawler.appendPageFilter(pageFilter)

    dict_files = physicCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:-4] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            # Collect the key terms, keeping only the text before ':'.
            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            with open(dest_path + key[:-4] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files

def parseHistoryDataset2(src_path, key_src_path, dest_path):
    # First pass: parse the AP user guide (key_src_path) to build an extended
    # list of key terms that can later be matched against the chapter texts.
    APHistUserGuide = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findStartBlock(a),
        Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findTextBlock(a),
        Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    APHistUserGuide.appendStartFilter(initFilter)
    APHistUserGuide.appendStartChapterFilter(startChapterFilter)
    APHistUserGuide.appendEndChapterFilter(endChapterFilter)
    APHistUserGuide.appendIncludeTextFilters(textFilter)
    APHistUserGuide.appendKeyTermsFilter(keyTermsFilter)

    user_guide = APHistUserGuide.parse(key_src_path)
    extended_key_terms = []
    for key in user_guide:
        for i, value in enumerate(user_guide[key]):
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    # Keep the text before ':'; strip hyphens and en dashes.
                    extended_key_terms.append(
                        term.split(':')[0].replace('-', '').replace('–', ''))

    # Second pass: parse the history book itself.
    historyCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: HistoryHSWebBlocks.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: HistoryHSWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: HistoryHSWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    # This text finder takes two arguments, unlike the user-guide one.
    textFilter = Filter()
    textFilter.addExpression(
        lambda a, b: HistoryHSWebBlocks.findTextBlock(a, b),
        Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: HistoryHSWebBlocks.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    historyCrawler.appendStartFilter(initFilter)
    historyCrawler.appendStartChapterFilter(startChapterFilter)
    historyCrawler.appendEndChapterFilter(endChapterFilter)
    historyCrawler.appendIncludeTextFilters(textFilter)
    historyCrawler.appendKeyTermsFilter(keyTermsFilter)

    dict_files = historyCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:-4] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            # Collect the book's own key terms, then extend them with
            # user-guide terms that occur in this chapter's text file.
            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            chapter_keys.extend(
                checkExistsInFile2(txt_f_name,
                                   extended_key_terms,
                                   n_gram_reduced=True,
                                   min_length=5))
            with open(dest_path + key[:-4] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files

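# Example invocation (a sketch: the file paths are hypothetical, and the
# output directories must already exist and end in a separator):
if __name__ == '__main__':
    parsePhysicDataSet('data/university_physics.pdf', None, 'out/physics/')
    parseHistoryDataset2('data/us_history.pdf',
                         'data/ap_history_user_guide.pdf',
                         'out/history/')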