def parse_full_page(self, target_url):
    """ Scrape all plain text from the target website.
        Filters the raw text: special characters are replaced with
        newlines, whitespace is stripped, and only sentences with a
        minimum number of words are retained. Handles the case where
        no text is retrieved.
        Args:
            target_url (str): URL of the page to scrape.
    """
    # Retrieve the raw plain text; bail out early if the page yields nothing.
    webtext = Pattern_Parsing.get_plain_text_fr_website(target_url)
    if webtext == '':
        print 'no text retrieved'
        return
    # Normalise special characters to newlines, then keep only sentences
    # with at least min_of_words_in_sentence words, up to the sentence limit.
    webtext = Pattern_Parsing.replace_special_char_as_newline(webtext)
    modified_text = Pattern_Parsing.retain_text_with_min_sentences_len(
        webtext, self.min_of_words_in_sentence, join_char='\n',
        limit_num_of_sentences=self.numlimit_of_sentences)
    self.parse_results_list.append(modified_text)
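
A minimal single-page usage sketch; the URL and the numeric limits here are illustrative assumptions, not values from the original example:

    # Hypothetical usage -- the URL and limits are assumptions.
    w = WebCrawler(['http://www.example.com'])
    w.set_limit_on_output_sentences(10)   # assumed to set numlimit_of_sentences
    w.min_of_words_in_sentence = 6
    w.parse_full_page('http://www.example.com')
    print w.parse_results_list[0]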
    
Example #3
    hh.consolidated_results()

    print 'End Search'
    print 'Start crawling individual results'
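
    # NOTE: SENTENCE_LIMIT and MIN_WORD_IN_SENTENCE are used below but are
    # not defined in this fragment. Assumed illustrative values only:
    SENTENCE_LIMIT = 50          # cap on sentences kept per crawled page
    MIN_WORD_IN_SENTENCE = 6     # minimum words for a sentence to be kept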

    ww = WebCrawler(hh.merged_result_links_list)
    ww.set_limit_on_output_sentences(SENTENCE_LIMIT)
    ww.min_of_words_in_sentence = MIN_WORD_IN_SENTENCE  # lower this to see word-level frequencies (may need to raise the sentence limit)
    ww.parse_all_urls()

    RESULT_FILE = r'c:\data\results_file.txt'
    ## Dump results to text file
    with open(RESULT_FILE, 'w') as f:
        for url, desc in zip(ww.list_of_urls, ww.parse_results_list):
            f.write('\n')
            f.write('#' * 20)
            f.write('\n')
            f.write(url + '\n')
            f.write('\n')
            f.write(desc.encode(errors='ignore') + '\n' + '#' * 18 + '\n')  # drop characters the default codec cannot encode

    ## Measure phrase frequency
    print 'Measuring phrase frequency'
    freq_save_filename = r'C:\data\results_file_freq.txt'

    most_common_phrases_list, phrases_freq_list = Pattern_Parsing.retrieve_top_freq_noun_phrases_fr_file(
        RESULT_FILE, 5000, 100, freq_save_filename)

    for (phrase, freq) in phrases_freq_list:
        print phrase, '  ', freq