def parsePhysicDataSet(src_path, key_src_path, dest_path):
    """Parse the OpenStax Physics PDF and write one .txt / .key pair per
    chapter. key_src_path is accepted for interface consistency but unused."""
    openStaxCrawler = PDFCrawler()

    # Build one filter per structural block the crawler must recognise.
    initFilter = Filter()
    initFilter.addExpression(lambda a: PhysicWebBlocks.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: PhysicWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: PhysicWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(lambda a: PhysicWebBlocks.findTextBlock(a),
                             Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: PhysicWebBlocks.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    # Attach the filters to the crawler.
    openStaxCrawler.appendStartFilter(initFilter)
    openStaxCrawler.appendStartChapterFilter(startChapterFilter)
    openStaxCrawler.appendEndChapterFilter(endChapterFilter)
    openStaxCrawler.appendIncludeTextFilters(textFilter)
    openStaxCrawler.appendKeyTermsFilter(keyTermsFilter)

    # Parse the source PDF(s).
    dict_files = openStaxCrawler.parse(src_path)

    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            # Chapter text: title line followed by the cleaned body.
            with open(dest_path + key[:6] + '_' + str(i) + '.txt', 'w+',
                      encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))
            # Key terms: one term per line, any ':'-separated definition dropped.
            with open(dest_path + key[:6] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                if 'chapter_keys' in value:
                    for term in value['chapter_keys']:
                        f.write(term.split(':')[0] + '\t\n')
    return dict_files
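

# Usage sketch (hypothetical): how one of these parsers is typically driven.
# All paths below are placeholders, not the project's real data layout.
def example_run_physics_parser():
    """Hypothetical driver; parsePhysicDataSet ignores key_src_path, so an
    empty string is passed for it."""
    import os

    src = 'data/raw/physics_openstax.pdf'   # assumed input location
    dest = 'data/parsed/physics/'           # assumed output directory
    os.makedirs(dest, exist_ok=True)

    chapters = parsePhysicDataSet(src, '', dest)

    # One .txt (title + cleaned body) and one .key (key terms) file is
    # written per chapter; the returned dict mirrors that structure.
    for pdf_name, chapter_list in chapters.items():
        print(pdf_name, len(chapter_list), 'chapters')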


def parseAPCollegeOSPhysics(src_path, key_src_path, dest_path):
    """Parse the OpenStax AP College Physics PDF; key_src_path is unused."""
    physicCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(
        lambda a: PhysicWebBlocksAPOpenStax.findStartBlock(a),
        Filter.MANDATORY_EXP)
    # Chapter-start and text detection take two arguments for this book.
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a, b: PhysicWebBlocksAPOpenStax.findStartChapterBlock(a, b),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: PhysicWebBlocksAPOpenStax.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(
        lambda a, b: PhysicWebBlocksAPOpenStax.findTextBlock(a, b),
        Filter.MANDATORY_EXP)
    kwFilter = Filter()
    kwFilter.addExpression(
        lambda a: PhysicWebBlocksAPOpenStax.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    physicCrawler.appendStartFilter(initFilter)
    physicCrawler.appendStartChapterFilter(startChapterFilter)
    physicCrawler.appendEndChapterFilter(endChapterFilter)
    physicCrawler.appendIncludeTextFilters(textFilter)
    physicCrawler.appendKeyTermsFilter(kwFilter)

    dict_files = physicCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            # key[:-4] drops the '.pdf' extension from the source file name.
            txt_f_name = dest_path + key[:-4] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            with open(dest_path + key[:-4] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files
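

# The Filter / PDFCrawler API is imported from elsewhere in the project and
# not defined in this module. As a reading aid, the sketch below captures the
# contract the call sites above imply: a filter holds predicates tagged
# MANDATORY_EXP, and a block matches when every mandatory predicate accepts
# it. This is an inference from usage, not the project's implementation.
class FilterSketch:
    MANDATORY_EXP = 'mandatory'

    def __init__(self):
        self._expressions = []

    def addExpression(self, predicate, kind):
        # predicate: a callable over a PDF block; some call sites pass
        # two-argument lambdas (e.g. the AP OpenStax text and
        # start-of-chapter filters), so arity is left to the caller.
        self._expressions.append((predicate, kind))

    def matches(self, *block_args):
        # A block passes when every mandatory predicate accepts it.
        return all(pred(*block_args)
                   for pred, kind in self._expressions
                   if kind == self.MANDATORY_EXP)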


def parseIGCSEHistory(src_path, key_src_path, dest_path):
    """Parse the IGCSE History PDF; key_src_path is unused."""
    historyCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: HistoryWebBlocksIGSE.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: HistoryWebBlocksIGSE.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: HistoryWebBlocksIGSE.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(lambda a: HistoryWebBlocksIGSE.findTextBlock(a),
                             Filter.MANDATORY_EXP)
    # Glossary and page-number filters are currently disabled:
    # glossaryTermsFilter = Filter()
    # glossaryTermsFilter.addExpression(
    #     lambda a, b: HistoryWebBlocksIGSE.findGlossaryBlock(a, b),
    #     Filter.MANDATORY_EXP)
    # pageFilter = Filter()
    # pageFilter.addExpression(
    #     lambda a: HistoryWebBlocksIGSE.findPageNumber(a),
    #     Filter.MANDATORY_EXP)
    kwFilter = Filter()
    kwFilter.addExpression(lambda a: HistoryWebBlocksIGSE.findKeyTermsBlock(a),
                           Filter.MANDATORY_EXP)

    historyCrawler.appendStartFilter(initFilter)
    historyCrawler.appendStartChapterFilter(startChapterFilter)
    historyCrawler.appendEndChapterFilter(endChapterFilter)
    historyCrawler.appendIncludeTextFilters(textFilter)
    # historyCrawler.appendGlossaryFilter(glossaryTermsFilter)
    historyCrawler.appendKeyTermsFilter(kwFilter)

    dict_files = historyCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:-4] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            with open(dest_path + key[:-4] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files


def parseBiologyDataSet2(src_path, key_src_path, dest_path):
    """Parse the Biology PDF dataset; key_src_path is unused."""
    biologyCrawler = PDFCrawler()

    initFilter = Filter()
    initFilter.addExpression(lambda a: BiologyWebBlocks2.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: BiologyWebBlocks2.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: BiologyWebBlocks2.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(lambda a: BiologyWebBlocks2.findTextBlock(a),
                             Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: BiologyWebBlocks2.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    biologyCrawler.appendStartFilter(initFilter)
    biologyCrawler.appendStartChapterFilter(startChapterFilter)
    biologyCrawler.appendEndChapterFilter(endChapterFilter)
    biologyCrawler.appendIncludeTextFilters(textFilter)
    biologyCrawler.appendKeyTermsFilter(keyTermsFilter)

    dict_files = biologyCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:6] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            # Extending with terms found in the text is currently disabled:
            # chapter_keys.extend(checkExistsInFile2(txt_f_name, extended_key_terms))
            with open(dest_path + key[:6] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files


def parseHistoryDataset2(src_path, key_src_path, dest_path):
    """Parse the history textbook PDF, extending each chapter's key-term
    list with terms from the AP History user guide at key_src_path that
    actually occur in the chapter text."""
    # First pass: crawl the AP user guide to collect the extended term list.
    APHistUserGuide = PDFCrawler()
    initFilter = Filter()
    initFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findStartBlock(a),
        Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findTextBlock(a),
        Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: HistoryAPUserGuideWebBlocks.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    APHistUserGuide.appendStartFilter(initFilter)
    APHistUserGuide.appendStartChapterFilter(startChapterFilter)
    APHistUserGuide.appendEndChapterFilter(endChapterFilter)
    APHistUserGuide.appendIncludeTextFilters(textFilter)
    APHistUserGuide.appendKeyTermsFilter(keyTermsFilter)

    user_guide = APHistUserGuide.parse(key_src_path)
    extended_key_terms = []
    for key in user_guide:
        for i, value in enumerate(user_guide[key]):
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    # Keep the term itself, stripping the definition part
                    # and any hyphens / en-dashes.
                    extended_key_terms.append(
                        term.split(':')[0].replace('-', '').replace('–', ''))

    # Second pass: crawl the history textbook itself.
    historyCrawler = PDFCrawler()
    initFilter = Filter()
    initFilter.addExpression(lambda a: HistoryHSWebBlocks.findStartBlock(a),
                             Filter.MANDATORY_EXP)
    startChapterFilter = Filter()
    startChapterFilter.addExpression(
        lambda a: HistoryHSWebBlocks.findStartChapterBlock(a),
        Filter.MANDATORY_EXP)
    endChapterFilter = Filter()
    endChapterFilter.addExpression(
        lambda a: HistoryHSWebBlocks.findEndChapterBlock(a),
        Filter.MANDATORY_EXP)
    textFilter = Filter()
    textFilter.addExpression(
        lambda a, b: HistoryHSWebBlocks.findTextBlock(a, b),
        Filter.MANDATORY_EXP)
    keyTermsFilter = Filter()
    keyTermsFilter.addExpression(
        lambda a: HistoryHSWebBlocks.findKeyTermsBlock(a),
        Filter.MANDATORY_EXP)

    historyCrawler.appendStartFilter(initFilter)
    historyCrawler.appendStartChapterFilter(startChapterFilter)
    historyCrawler.appendEndChapterFilter(endChapterFilter)
    historyCrawler.appendIncludeTextFilters(textFilter)
    historyCrawler.appendKeyTermsFilter(keyTermsFilter)

    dict_files = historyCrawler.parse(src_path)
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            txt_f_name = dest_path + key[:-4] + '_' + str(i) + '.txt'
            with open(txt_f_name, 'w+', encoding="utf8") as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))

            chapter_keys = []
            if 'chapter_keys' in value:
                for term in value['chapter_keys']:
                    chapter_keys.append(term.split(':')[0])
            # Add user-guide terms that occur in this chapter's text file.
            chapter_keys.extend(
                checkExistsInFile2(txt_f_name, extended_key_terms,
                                   n_gram_reduced=True, min_length=5))
            with open(dest_path + key[:-4] + '_' + str(i) + '.key', 'w+',
                      encoding="utf8") as f:
                for term in chapter_keys:
                    f.write(term + '\t\n')
    return dict_files
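

# All five parsers end with the same per-chapter write-out loop. Should the
# module ever be refactored, that loop could be extracted into one shared
# helper; the sketch below reproduces the behavior of the loops above, but
# the function and parameter names are hypothetical, not part of the project.
def write_chapter_files(dict_files, dest_path, stem, extra_terms_fn=None):
    """Hypothetical shared helper.

    stem: maps a source file name to an output file stem, e.g.
        lambda key: key[:-4]   # drop a '.pdf' extension
    extra_terms_fn: optional callable receiving the written .txt path and
    returning extra key terms, mirroring the checkExistsInFile2 call in
    parseHistoryDataset2.
    """
    for key in dict_files:
        for i, value in enumerate(dict_files[key]):
            base = dest_path + stem(key) + '_' + str(i)
            # Chapter text: title line followed by the cleaned body.
            with open(base + '.txt', 'w+', encoding='utf8') as f:
                f.write('Title: ' + value['chapter_title'] + '\t\n')
                f.write(nltk_clean(value['chapter_text']))
            # Key terms: drop ':'-separated definitions, optionally extend.
            terms = [t.split(':')[0] for t in value.get('chapter_keys', [])]
            if extra_terms_fn is not None:
                terms.extend(extra_terms_fn(base + '.txt'))
            with open(base + '.key', 'w+', encoding='utf8') as f:
                for term in terms:
                    f.write(term + '\t\n')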