def parse_full_page(self, target_url):
    """Scrape the plain text of one page, filter it, and store the result.

    The text fetched for ``target_url`` has its special characters replaced
    by newlines and is then passed through the minimum-sentence-length
    filter, using ``self.min_of_words_in_sentence`` as the threshold,
    ``'\\n'`` as the join character and ``self.numlimit_of_sentences`` as
    the sentence limit. The filtered text is appended to
    ``self.parse_results_list``.

    When the fetched text is an empty string, 'no text' is printed and the
    method returns without appending anything.

    Args:
        target_url (str): Url str.
    """
    raw_text = Pattern_Parsing.get_plain_text_fr_website(target_url)
    if raw_text == '':
        # NOTE(review): nothing is appended for this URL, so
        # parse_results_list can end up shorter than the list of URLs --
        # confirm that code pairing the two by position tolerates this.
        print('no text')
        return

    cleaned_text = Pattern_Parsing.replace_special_char_as_newline(raw_text)
    filtered_text = Pattern_Parsing.retain_text_with_min_sentences_len(
        cleaned_text,
        self.min_of_words_in_sentence,
        join_char='\n',
        limit_num_of_sentences=self.numlimit_of_sentences,
    )
    self.parse_results_list.append(filtered_text)
ww = WebCrawler(hh.merged_result_links_list) ww.set_limit_on_output_sentences(SENTENCE_LIMIT) ww.min_of_words_in_sentence = MIN_WORD_IN_SENTENCE #minimize this if want to see the frequency of words (but might need to increase the num of sentences) ww.parse_all_urls() RESULT_FILE = r'c:\data\results_file.txt' ## Dump results to text file with open(RESULT_FILE,'w') as f: for url, desc in zip(ww.list_of_urls, ww.parse_results_list): f.write('\n') f.write('#'*20) f.write('\n') f.write(url + '\n') f.write('\n') f.write(desc.encode(errors = 'ignore') + '\n' + '#'*18 + '\n') ## Enable freq note print 'Measure phrases Frequency' freq_save_filename = r'C:\data\results_file_freq.txt' most_common_phrases_list, phrases_freq_list = Pattern_Parsing.retrieve_top_freq_noun_phrases_fr_file(RESULT_FILE, 5000, 100, freq_save_filename) for (phrase, freq) in phrases_freq_list: print phrase, ' ', freq
hh.consolidated_results() print 'End Search' print 'Start crawling individual results' ww = WebCrawler(hh.merged_result_links_list) ww.set_limit_on_output_sentences(SENTENCE_LIMIT) ww.min_of_words_in_sentence = MIN_WORD_IN_SENTENCE #minimize this if want to see the frequency of words (but might need to increase the num of sentences) ww.parse_all_urls() RESULT_FILE = r'c:\data\results_file.txt' ## Dump results to text file with open(RESULT_FILE, 'w') as f: for url, desc in zip(ww.list_of_urls, ww.parse_results_list): f.write('\n') f.write('#' * 20) f.write('\n') f.write(url + '\n') f.write('\n') f.write(desc.encode(errors='ignore') + '\n' + '#' * 18 + '\n') ## Enable freq note print 'Measure phrases Frequency' freq_save_filename = r'C:\data\results_file_freq.txt' most_common_phrases_list, phrases_freq_list = Pattern_Parsing.retrieve_top_freq_noun_phrases_fr_file( RESULT_FILE, 5000, 100, freq_save_filename) for (phrase, freq) in phrases_freq_list: print phrase, ' ', freq