def parsePhysicDataSet(src_path, key_src_path, dest_path):
    # key_src_path is unused here: the key terms come from the book itself.
    openStaxCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: PhysicWebBlocks.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: PhysicWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: PhysicWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(lambda a: PhysicWebBlocks.findTextBlock(a),
                             Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: PhysicWebBlocks.findKeyTermsBlock(a), Filter.MANDATORY_EXP)

    # Register the filters with the crawler.
    openStaxCrawler.appendStartFilter(initFilter)
    openStaxCrawler.appendStartChapterFilter(startChapterFilter)
    openStaxCrawler.appendEndChapterFilter(endChapterFilter)
    openStaxCrawler.appendIncludeTextFilters(textFilter)
    openStaxCrawler.appendKeyTermsFilter(keyTermsFilter)

    # Parse the source PDF into {file_name: [chapter_dict, ...]}.
    dict_files = openStaxCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            # Chapter text file (dest_path is assumed to end in a separator).
            with open(dest_path + key[:6] + '_' + str(i) + '.txt', 'w+',
                      encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))
            # Gold key terms, one per line; keep only the text before ':'.
            with open(dest_path + key[:6] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                if 'chapter_keys' in value:
                    for term in value['chapter_keys']:
                        f.write(term.split(':')[0] + '\t\n')
    return dict_files

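# The parse functions in this module all wire up the same filter set and
# differ only in the *WebBlocks class they delegate to. As a sketch only
# (the helper name is hypothetical; it assumes the Filter/PDFCrawler API
# used above and covers the common single-argument block finders), the
# wiring could be factored out like this:
def buildStandardCrawler(blocks):
    crawler = PDFCrawler()
    # Pair each block finder with the crawler hook it belongs to.
    for find_fn, append_fn in [
        (blocks.findStartBlock, crawler.appendStartFilter),
        (blocks.findStartChapterBlock, crawler.appendStartChapterFilter),
        (blocks.findEndChapterBlock, crawler.appendEndChapterFilter),
        (blocks.findTextBlock, crawler.appendIncludeTextFilters),
        (blocks.findKeyTermsBlock, crawler.appendKeyTermsFilter),
    ]:
        f = Filter()
        f.addExpression(find_fn, Filter.MANDATORY_EXP)
        append_fn(f)
    return crawler
# e.g. the function above could then begin with:
#     openStaxCrawler = buildStandardCrawler(PhysicWebBlocks)
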
def parseHumanGeography(src_path, key_src_path, dest_path):
    hToToCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: GeographyWebBlocks.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: GeographyWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: GeographyWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(lambda a: GeographyWebBlocks.findTextBlock(a),
                             Filter.MANDATORY_EXP)
    kwFilter = Filter()
    kwFilter.addExpression(lambda a: GeographyWebBlocks.findKeyTermsBlock(a),
                           Filter.MANDATORY_EXP)

    # Register the filters with the crawler.
    hToToCrawler.appendStartFilter(initFilter)
    hToToCrawler.appendStartChapterFilter(startChapterFilter)
    hToToCrawler.appendEndChapterFilter(endChapterFilter)
    hToToCrawler.appendIncludeTextFilters(textFilter)
    hToToCrawler.appendKeyTermsFilter(kwFilter)

    dict_files = hToToCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:-4] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            # Collect the key terms, keeping only the text before ':'.
            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            # Write the non-blank terms, one per line.
            with open(dest_path + key[:-4] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    if len(term.strip()) > 0:
                        f.write(term + '\t\n')
    return dict_files

def parseBiologyDataSet2(src_path, key_src_path, dest_path):
    biologyCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: BiologyWebBlocks2.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: BiologyWebBlocks2.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: BiologyWebBlocks2.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(lambda a: BiologyWebBlocks2.findTextBlock(a),
                             Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: BiologyWebBlocks2.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    # Register the filters with the crawler.
    biologyCrawler.appendStartFilter(initFilter)
    biologyCrawler.appendStartChapterFilter(startChapterFilter)
    biologyCrawler.appendEndChapterFilter(endChapterFilter)
    biologyCrawler.appendIncludeTextFilters(textFilter)
    biologyCrawler.appendKeyTermsFilter(keyTermsFilter)

    dict_files = biologyCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:6] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            # Collect the key terms, keeping only the text before ':'.
            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            # chapter_keys.extend(checkExistsInFile2(txt_f_name, extended_key_terms))
            with open(dest_path + key[:6] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files

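# The per-chapter write-out loop above is repeated in every parser. A possible
# shared writer (a sketch only; the function name is hypothetical, stem_len
# mirrors the key[:6] / key[:-4] slices used above, and dest_path is assumed
# to end in a path separator):
def writeChapterFiles(dict_files, dest_path, stem_len=6):
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            stem = dest_path + key[:stem_len] + '_' + str(i)
            with open(stem + '.txt', 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))
            with open(stem + '.key', 'w+', encoding="utf8") as f:
                for term in value.get('chapter_keys', []):
                    term = term.split(':')[0]
                    if term.strip():
                        f.write(term + '\t\n')
# Call with stem_len=-4 for the parsers that slice the '.pdf' extension off
# the source file name instead of truncating it to six characters.
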
def parseIGCSEPhysics(src_path, key_src_path, dest_path):
    physicCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findStartBlock(a),
        Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findTextBlock(a),
        Filter.MANDATORY_EXP)
    # The glossary finder takes two arguments, unlike the other block finders.
    glossaryTermsFilter = Filter()
    glossaryTermsFilter.addExpression(
        lambda a, b: PhysicsWebBlocksIGCSE.findGlossaryBlock(a, b),
        Filter.MANDATORY_EXP)
    pageFilter = Filter()
    pageFilter.addExpression(
        lambda a: PhysicsWebBlocksIGCSE.findPageNumber(a),
        Filter.MANDATORY_EXP)

    # Register the filters with the crawler.
    physicCrawler.appendStartFilter(initFilter)
    physicCrawler.appendStartChapterFilter(startChapterFilter)
    physicCrawler.appendEndChapterFilter(endChapterFilter)
    physicCrawler.appendIncludeTextFilters(textFilter)
    physicCrawler.appendGlossaryFilter(glossaryTermsFilter)
    physicCrawler.appendPageFilter(pageFilter)

    dict_files = physicCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:-4] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            # Collect the key terms, keeping only the text before ':'.
            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            with open(dest_path + key[:-4] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files

def parseHistoryDataset2(src_path, key_src_path, dest_path):
    # First pass: parse the AP user guide (key_src_path) to build an extended
    # list of key terms that can later be matched against the chapter texts.
    APHistUserGuide = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findStartBlock(a),
        Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findTextBlock(a),
        Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    APHistUserGuide.appendStartFilter(initFilter)
    APHistUserGuide.appendStartChapterFilter(startChapterFilter)
    APHistUserGuide.appendEndChapterFilter(endChapterFilter)
    APHistUserGuide.appendIncludeTextFilters(textFilter)
    APHistUserGuide.appendKeyTermsFilter(keyTermsFilter)

    user_guide = APHistUserGuide.parse(key_src_path)
    extended_key_terms = []
    for key in user_guide:
        for i, value in enumerate(user_guide[key]):
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    # Keep the text before ':'; strip hyphens and en dashes.
                    extended_key_terms.append(
                        term.split(':')[0].replace('-', '').replace('–', ''))

    # Second pass: parse the history book itself.
    historyCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: HistoryHSWebBlocks.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: HistoryHSWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: HistoryHSWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    # This text finder takes two arguments, unlike the user-guide one.
    textFilter = Filter()
    textFilter.addExpression(
        lambda a, b: HistoryHSWebBlocks.findTextBlock(a, b),
        Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: HistoryHSWebBlocks.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    historyCrawler.appendStartFilter(initFilter)
    historyCrawler.appendStartChapterFilter(startChapterFilter)
    historyCrawler.appendEndChapterFilter(endChapterFilter)
    historyCrawler.appendIncludeTextFilters(textFilter)
    historyCrawler.appendKeyTermsFilter(keyTermsFilter)

    dict_files = historyCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:-4] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            # Collect the book's own key terms, then extend them with
            # user-guide terms that occur in this chapter's text file.
            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            chapter_keys.extend(
                checkExistsInFile2(txt_f_name,
                                   extended_key_terms,
                                   n_gram_reduced=True,
                                   min_length=5))
            with open(dest_path + key[:-4] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files

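# Example invocation (a sketch: the file paths are hypothetical, and the
# output directories must already exist and end in a separator):
if __name__ == '__main__':
    parsePhysicDataSet('data/university_physics.pdf', None, 'out/physics/')
    parseHistoryDataset2('data/us_history.pdf',
                         'data/ap_history_user_guide.pdf',
                         'out/history/')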